In [None]:
"""
! pip install transformers[torch]
! pip install accelerate -U
! pip install accelerate>=0.20.1
! pip install datasets
! pip install transformers
! pip install sacrebleu
"""

In [1]:
!git clone https://github.com/ivancheroleg/Text-de-toxification-PMLDL-IU

Archive:  /content/dataset.zip
replace dataset/dataset_dict.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [3]:
from datasets import load_from_disk

# load dataset from local file
dataset = load_from_disk("/content/Text-de-toxification-PMLDL-IU/data/interim/dataset")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 519999
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 28888
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 28888
    })
})

In [43]:
model_checkpoint = "t5-base"

from transformers import AutoTokenizer

# we will use autotokenizer for this purpose
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [44]:
prefix = "translate English to Deutsch:"

max_input_length = 128
max_target_length = 128
source_lang = "toxic"
target_lang = "non-toxic"

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

preprocess_function(dataset["train"][:2])

{'input_ids': [[13959, 1566, 12, 3, 18609, 10, 99, 901, 4031, 8347, 7, 160, 28, 160, 2550, 2670, 6, 34, 133, 3209, 8, 306, 1425, 13, 6567, 7031, 1538, 449, 5, 1], [13959, 1566, 12, 3, 18609, 10, 4188, 31, 60, 2852, 27635, 53, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[156, 901, 4031, 19, 18368, 160, 28, 26829, 2670, 6, 24, 3, 9453, 8, 306, 593, 13, 6567, 7031, 1538, 4849, 5, 1], [852, 25, 31, 60, 652, 23147, 5, 1]]}

In [45]:
from datasets import load_dataset, load_metric

metric = load_metric("sacrebleu")

In [46]:
# for the example purpose we will crop the dataset and select first 5000 for train
# and 500 for validation and test
cropped_datasets = dataset
cropped_datasets['train'] = dataset['train'].select(range(5000))
cropped_datasets['validation'] = dataset['validation'].select(range(500))
cropped_datasets['test'] = dataset['test'].select(range(500))
tokenized_datasets = cropped_datasets.map(preprocess_function, batched=True)
#tokenized_datasets['train'][0]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'translation': {'non-toxic': 'If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.',
  'toxic': 'if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.'},
 'input_ids': [13959,
  1566,
  12,
  3,
  18609,
  10,
  99,
  901,
  4031,
  8347,
  7,
  160,
  28,
  160,
  2550,
  2670,
  6,
  34,
  133,
  3209,
  8,
  306,
  1425,
  13,
  6567,
  7031,
  1538,
  449,
  5,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [156,
  901,
  4031,
  19,
  18368,
  160,
  28,
  26829,
  2670,
  6,
  24,
  3,
  9453,
  8,
  306,
  593,
  13,
  6567,
  7031,
  1538,
  4849,
  5,
  1]}

In [47]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# create a model for the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [48]:
# defining the parameters for training
batch_size = 32
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
)

In [49]:
# instead of writing collate_fn function we will use DataCollatorForSeq2Seq
# simliarly it implements the batch creation for training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [50]:
import numpy as np

# simple postprocessing for text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [53]:
import torch
torch.cuda.empty_cache()

In [56]:
import gc

gc.collect()

1431

In [57]:
# instead of writing train loop we will use Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [58]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,3.090088,7.861,13.126
2,No log,2.974567,10.624,12.584
3,No log,2.824568,13.6892,13.31
4,2.859400,2.81641,14.2646,13.2
5,2.859400,2.820078,14.4976,12.622
6,2.859400,2.882862,15.4441,13.284
7,1.874900,3.034338,13.5971,12.914




Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,3.090088,7.861,13.126
2,No log,2.974567,10.624,12.584
3,No log,2.824568,13.6892,13.31
4,2.859400,2.81641,14.2646,13.2
5,2.859400,2.820078,14.4976,12.622
6,2.859400,2.882862,15.4441,13.284
7,1.874900,3.034338,13.5971,12.914
8,1.874900,3.228065,13.529,12.458
9,1.874900,3.452247,14.5028,13.144
10,0.988500,3.785875,14.0646,12.986




TrainOutput(global_step=1570, training_loss=1.8475923793331073, metrics={'train_runtime': 669.7545, 'train_samples_per_second': 74.654, 'train_steps_per_second': 2.344, 'total_flos': 3061269375713280.0, 'train_loss': 1.8475923793331073, 'epoch': 10.0})

In [59]:
# saving model
trainer.save_model('best')

In [60]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('best')
model.eval()
model.config.use_cache = False

In [61]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True,temperature=0))

In [62]:
print(dataset['train'][0])

{'translation': {'non-toxic': 'If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.', 'toxic': 'if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.'}}


In [79]:
text = 'if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.'
translate(model, text, tokenizer=tokenizer)

If Alkar is flooding her with psychic waste, it would explain the high level of neurotrans


In [77]:
cropped_datasets['test'][i]['translation']['toxic']

'"I\'d like to put in the hands of the heretic.'

In [74]:
for i in range(1, cropped_datasets['test'].shape[0]):
    translated = translate(model, cropped_datasets['test'][i]['translation']['toxic'], tokenizer=tokenizer)
    print(translated, i)
    metric.add_batch(predictions=translated, references=cropped_datasets['test'][i]['translation']['non-toxic'])

"It like to put the hand in the hands of the heretic.
None 1


TypeError: ignored