<a href="https://colab.research.google.com/github/jantuitman/deeplearning/blob/main/Training_t5_nieuwe_versie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers;
!pip install torch;
!pip install sklearn;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://u

In [2]:
from transformers import  T5ForConditionalGeneration, Adafactor, T5Tokenizer, AutoTokenizer, PreTrainedModel
import torch
import time

import json

In [3]:
# read a json file containing an array of objects
# each object has a "question" and "answer" field
# the question is the input and the answer is the output
# we will return a list of tuples
def read_dataset(filename):
    with open(filename) as f:
        data = json.load(f)
        # data has the following structure:
        # data = {
        #     "paragraphs": [
        #         {
        #             "translated_context": "text",
        #             "translated_question": "text",
        #             "translated_answers": [
        #                 "text",
        #                 "text",
        #                 "text"
        #             ],
        #             "is_impossible": true
        #         },
        #         ...
        #     ]
        # }
        # if is_impossible is true, then translated_answers is empty and we want to return the answer "Onbekend"
        # if is_impossible is false, then translated_answers is not empty and we want to return the first answer
        # we will return a list of tuples
        # we need to make a question_text from the translated_context and translated_question
        result= []
        for paragraph in data['paragraphs']:
            question_text = f"Context: {paragraph['translated_context']}\n\nVraag: {paragraph['translated_question']}"
            answer_text = "Onbekend"
            if not paragraph['is_impossible']:
               answer_text = paragraph['translated_answers'][0]
            result.append((question_text, answer_text))
        return result

In [4]:
from torch.utils.data import TensorDataset

def my_collator(data):
    # Tokenize the input and output sequences
    input_text = [example[0] for example in data]
    target_text = [example[1] for example in data]
    input_ids = tokenizer.batch_encode_plus(input_text, return_tensors='pt', pad_to_max_length=True)['input_ids']
    attention_mask = tokenizer.batch_encode_plus(input_text, return_tensors='pt', pad_to_max_length=True)['attention_mask']
    labels = tokenizer.batch_encode_plus(target_text, return_tensors='pt', pad_to_max_length=True)['input_ids']

    return { "input_ids":torch.stack([input_id for input_id in input_ids]), 
            "attention_mask": torch.stack([a for a in attention_mask]),
            "labels": torch.stack([l for l in labels])}

In [5]:
from sklearn.model_selection import train_test_split

model_name='yhavinga/t5-base-dutch'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)




Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
data = read_dataset('drive/MyDrive/InputData/dev-nl.json')
train_data, val_data = train_test_split(data,test_size=0.1)

# Prepare the train dataset
#train_dataset = prepare_dataset(train_data,tokenizer)
#val_dataset = prepare_dataset(val_data,tokenizer)


In [7]:
from transformers import Trainer, TrainingArguments

# main output dir
main_output_dir = './drive/MyDrive/OutputModels/t5_new3'
# Configure the training arguments
training_args = TrainingArguments(
    output_dir=f'{main_output_dir}/results',
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_steps=1000,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir=f'{main_output_dir}/logs',
    logging_steps=100,
    load_best_model_at_end=True
)

In [9]:
# Create the trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=my_collator
)

# Run the fine-tuning

trainer.train()

***** Running training *****
  Num examples = 10685
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 26715
  Number of trainable parameters = 222884352


Step,Training Loss,Validation Loss
1000,2.4635,1.651131
2000,1.6499,1.207619
3000,1.3365,1.161503
4000,1.1739,1.101207
5000,1.2157,1.025484
6000,1.1025,0.951561
7000,0.867,0.835903
8000,0.8991,0.753754
9000,0.9189,0.688938
10000,0.7865,0.686644


***** Running Evaluation *****
  Num examples = 1188
  Batch size = 2
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-1000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-1000/config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 2
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-2000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-2000/config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 2
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-3000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-3000/config.json
Model weights saved in ./drive/MyD

TrainOutput(global_step=26715, training_loss=0.8902457954953633, metrics={'train_runtime': 4893.5263, 'train_samples_per_second': 10.917, 'train_steps_per_second': 5.459, 'total_flos': 1.83803175459072e+16, 'train_loss': 0.8902457954953633, 'epoch': 5.0})

In [8]:
from transformers import Trainer, TrainingArguments
model = T5ForConditionalGeneration.from_pretrained('./drive/MyDrive/OutputModels/t5_new3/results/checkpoint-26000')
# main output dir
main_output_dir = './drive/MyDrive/OutputModels/t5_new4'
# Configure the training arguments
training_args = TrainingArguments(
    output_dir=f'{main_output_dir}/results',
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_steps=1000,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir=f'{main_output_dir}/logs',
    logging_steps=100,
    load_best_model_at_end=True
)

In [9]:
# Create the trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=my_collator
)

# Run the fine-tuning

trainer.train()

***** Running training *****
  Num examples = 10685
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 53440
  Number of trainable parameters = 222884352


Step,Training Loss,Validation Loss
1000,0.361,0.394672
2000,0.4968,0.40128
3000,0.3676,0.404324
4000,0.3466,0.405856
5000,0.3324,0.414337
6000,0.2496,0.428481
7000,0.2767,0.431794
8000,0.3353,0.428004
9000,0.252,0.443006
10000,0.2472,0.439505


***** Running Evaluation *****
  Num examples = 1188
  Batch size = 4
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-1000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-1000/config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 4
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-2000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-2000/config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 4
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-3000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new4/results/checkpoint-3000/config.json
Model weights saved in ./drive/MyD

KeyboardInterrupt: ignored