In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'Device: cuda'


## Loading models

### Loading BART
* https://huggingface.co/facebook/bart-large

In [4]:
from transformers import BartTokenizer, AutoModelForSeq2SeqLM

model_name = "bart-large"
model_repo = f"facebook"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
print(type(model))

model_name: bart-large,model_id: facebook/bart-large,model_path: facebook_bart-large
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


## Loading dataset

In [5]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

train_ratio = 0.001
test_ratio = 0.0001

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
# dataset = dataset.map(
#     lambda item: {
#         "input": item["src"],
#         "reference": item["tgt"],
#         "references": [item["tgt"]],
#     },
#     remove_columns=["src", "tgt", "_id"],
# )
print(dataset)
print(dataset["train"][0])

train set {'paraphrase', 'neutralize', 'clarity', 'gec', 'simplification', 'coherence'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 68
})
test set {'paraphrase', 'neutralize', 'clarity', 'gec', 'simplification', 'coherence'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 12
})
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 68
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 12
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to t

In [6]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length train: {max_input_length}")
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max_input_length test: {max_input_length}")

max_input_length train: 55
max_input_length test: 96


In [7]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, list in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(list)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/paraphrase: 15
train/neutralize: 11
train/clarity: 1
train/gec: 20
train/simplification: 11
train/coherence: 10


In [19]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, list in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(list)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

DatasetDict({
    paraphrase: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 2
    })
    neutralize: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 2
    })
    clarity: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 1
    })
    gec: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 3
    })
    simplification: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 2
    })
    coherence: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 2
    })
})


In [9]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

print(f"pad_token_id: {tokenizer.pad_token_id}")
print(f"padding_side: {tokenizer.padding_side}")


def process_dataset(batch):
    model_inputs = tokenizer(batch["input"], max_length=1024, padding=True)
    labels = tokenizer(text_target=batch["reference"], max_length=1024, padding=True).input_ids
    # model_inputs = tokenizer(batch["input"], padding=True)
    # labels = tokenizer(text_target=batch["reference"], padding=True).input_ids
    # model_inputs = tokenizer(batch["input"])
    # labels = tokenizer(text_target=batch["reference"]).input_ids
    model_inputs["labels"] = labels
    return model_inputs


# processed_dataset = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10, remove_columns=train_dataset.column_names)
# processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10)
print(processed_dataset)
print(len(processed_dataset["train"]["input_ids"][0]), processed_dataset["train"]["input_ids"][0])
print(len(processed_dataset["train"]["attention_mask"][0]), processed_dataset["train"]["attention_mask"][0])
print(len(processed_dataset["train"]["labels"][0]), processed_dataset["train"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", model=model)
# data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

# dataloader = DataLoader(processed_dataset, batch_size=5, collate_fn=data_collator)
# for batch in dataloader:
#     print(batch)
#     break

pad_token_id: 2
padding_side: right
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 68
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12
    })
})
49 [0, 47583, 70, 25187, 45816, 9126, 31, 42, 2788, 35, 286, 1246, 6, 749, 19, 10, 319, 9, 38905, 64, 8470, 763, 3899, 49, 10348, 7, 712, 49, 40752, 1212, 8, 634, 20500, 7, 694, 2382, 514, 7, 5, 10348, 4, 2, 2, 2, 2, 2, 2, 2, 2]
49 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
40 [0, 2709, 1246, 6, 749, 19, 10, 319, 9, 38905, 64, 7891, 49, 10348, 7, 712, 49, 40752, 1212, 8, 304, 20500, 7, 694, 2382, 514, 7, 5, 10348, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


## Training

In [10]:
%reload_ext autoreload

training_model_repo = f"iliazlobin"
training_model_name: str = f"{model_name}-coedit"
training_model_id = f"{training_model_repo}/{training_model_name}"
training_model_checkpoint = f"{training_model_id}"
training_model_path = f"{training_model_repo}_{training_model_name}"
print(
    f"training_model_name: {training_model_name}, "
    f"training_model_id: {training_model_id}, "
    f"training_model_path: {training_model_path}"
)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.95
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

training_model_name: bart-large-coedit, training_model_id: iliazlobin/bart-large-coedit, training_model_path: iliazlobin_bart-large-coedit
total/used/cuda/res/ram (Gb): 10.00/2.99/1.51/1.91/15.70
total/used/available memory (Gb): 10.00/2.99/7.01
recommended/actual fraction: 0.70/0.90


In [26]:
print(test_dataset)
print()
print(test_dataset[0]["input"])
print(test_dataset[0]["reference"])
print()
print(test_dataset[1]["input"])
print(test_dataset[1]["reference"])
print()
print(test_dataset[2]["input"])
print(test_dataset[2]["reference"])

Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 12
})

Fix grammar: The romance is just a small parts where Batman have to choose whether to be a hero or going with his girlfriend I believed other people would also loved this movie because its not only thrilling or exciting, it makes you feeling like you are in the movie experiencing what is went on across the movies.
The romance is just a small part where Batman has to choose whether to be a hero or go with his girlfriend. I believe other people would also love this movie because it's not only thrilling, it makes you feel like you are in the movie, experiencing what is going on in the movie.

Fix grammatical errors: Then, concerning a slight rising in 1992, it fell back about the initial levels at the end of 1994.
Then, after a slight rise in 1992, they fell back to the initial level at the end of 1994.

Fix grammatical mistakes in this sentence: Most of this were down to the speed following which o

In [16]:
training_model = model
print(type(training_model))
print(training_model.config)

total_params = sum(p.numel() for p in training_model.parameters())
total_trainable_params = sum(p.numel() for p in training_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")


rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")


def compute_metrics(eval_pred):
    print(f"> {eval_pred}")
    predictions, labels = eval_pred
    # print(predictions)
    # print(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    print(decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(decoded_labels)

    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    sacreblue_score = sacreblue_metric.compute(predictions=decoded_preds, references=decoded_labels)
    # sari_score = sari_metric.compute(
    #     sources=processed_samples["input"],
    #     predictions=processed_samples["processed"],
    #     references=processed_samples["references"],
    # )
    em_score = em_metric.compute(predictions=decoded_preds, references=decoded_labels)

    report = {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "rougeLsum": rouge_score["rougeLsum"],
        "sacreblue": sacreblue_score["score"],
        # "sari.sari": 0,
        "em": em_score["exact_match"],
    }

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    report["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in report.items()}

<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "facebook/bart-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
   

In [12]:
%%capture
%load_ext tensorboard
%tensorboard --logdir "model-{training_model_path}-train/runs"

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

train_path = f"model-{training_model_path}-train"
print(f"train_path: {train_path}")

train_size = len(processed_dataset["train"])
batch_size = 1
gradient_accumulation_steps = 4

per_epoch_steps = train_size / (batch_size * gradient_accumulation_steps)
max_steps = 10
epochs = max_steps / per_epoch_steps
print(f"train_size: {train_size}, per_epoch_steps: {per_epoch_steps}, max_steps: {max_steps}, epochs: {epochs}")

# if max_steps < per_epoch_steps:
#     raise Exception(f"Training doesn't cover the entire training dataset with {train_size} samples")

epochs = 1
epoch_total_steps = epochs * per_epoch_steps
print(f"train_size: {train_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}")

args = Seq2SeqTrainingArguments(
    output_dir=train_path,
    learning_rate=2e-5,
    weight_decay=0.01,
    # warmup_ratio=0.05,
    # optim="paged_adamw_8bit",
    fp16=True,
    predict_with_generate=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=epochs,
    # max_steps=max_steps,
    # warmup_steps=2,
    logging_steps=10,
    save_strategy="steps",
    save_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    report_to="tensorboard",
)


trainer = Seq2SeqTrainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=args,
)
print(type(trainer))

model.config.use_cache = False
trainer.train()

train_path: model-iliazlobin_bart-large-coedit-train
train_size: 68, per_epoch_steps: 17.0, max_steps: 10, epochs: 0.5882352941176471
train_size: 68, per_epoch_steps: 17.0, epochs: 1, epoch_total_steps: 17.0
<class 'transformers.trainer_seq2seq.Seq2SeqTrainer'>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Sacreblue,Em,Gen Len
10,0.3922,0.432775,0.5873,0.4385,0.4985,0.4989,17.6701,0.0,16.3333




> <transformers.trainer_utils.EvalPrediction object at 0x7feb9418c370>
[[    2     0   100  2047    97    82    74    67  2638    42  1569   142
     63    45   129 16208    50  3571     6     2]
 [    2     0  4993    10  7019  2227    11  8548     6    24  1064   124
     59     5  2557  1389    23     5   253     2]
 [    2     0  2895     9    42    58   159     7     5  2078   511    61
     84  1074    32    38  1266    55   390     2]
 [    2     0  1779   576    13   765  5788     6   516   329 30912    16
     10  3487  1522 25465     4     2     2     2]
 [    2     0   673   128 14919  1224  2038    11 13025     4     2     2
      2     2     2     2     2     2     2     2]
 [    2     0 45714    10   357 44680  1732     9     5  3645    35    91
     16    10   411    12   958   404    12     2]
 [    2     0 39008   439     7  1429    18   256  4001   139 10394 18879
     11    10    86     9   112    35  4431     2]
 [    2     0   250 10638     8  2866  1465    13  169

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=17, training_loss=0.42854440913480873, metrics={'train_runtime': 15.1146, 'train_samples_per_second': 4.499, 'train_steps_per_second': 1.125, 'total_flos': 5671719075840.0, 'train_loss': 0.42854440913480873, 'epoch': 1.0})

In [15]:
print(type(trainer))
print(type(trainer.model))

trainer.save_model(f"model-{training_model_path}")
# trainer.push_to_hub(training_model_id, token=os.getenv("HUGGING_FACE_TOKEN"))
trained_model = trainer.model
model = trained_model

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


<class 'transformers.trainer_seq2seq.Seq2SeqTrainer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


## Test inference

In [None]:
max_batch = 2
max_length = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(input.input_ids, max_length=max_length)
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {processed}")


>> gec
input: ['Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.', 'Fix grammaticality in this sentence: They are increasing rapidly in Japan for a couple of years.']
result: ['Despite the strict Japanese society, I feel happy when I have dinner with my family.', 'They have been increasing rapidly in Japan for a couple of years.']

>> coherence
input: ['Fix coherence: Outside the town, 6 tourists were reported killed. Official documents indicate that at least 255 local residents were killed, with a further 29 never found.', 'Make the text more cohesive: Whereas at some times (and in some places) a Corps of two divisions was sufficient, at other times 5 or 6 divisions were necessary. Under the Hindenburg regime (from summer 1916), new Corps headquarters were created without organic divisions.']
result: ['Outside the town, 6 tourists were reported killed. However, official documents indicate that at least 255 local resi