<a href="https://colab.research.google.com/github/jantuitman/deeplearning/blob/main/nog_een_trainingsessie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers;
!pip install torch;
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import  T5ForConditionalGeneration, Adafactor, T5Tokenizer, AutoTokenizer, PreTrainedModel
import torch
import time

import json

In [None]:
def read_dataset2(filename):
    """Build (context prompt, questions) pairs for question generation.

    The JSON file is expected to have this structure::

        {
            "paragraphs": [
                {
                    "translated_context": "text",
                    "translated_question": "text",
                    "translated_answers": ["text", ...],
                    "is_impossible": true
                },
                ...
            ]
        }

    Consecutive entries that share the same ``translated_context`` form one
    group. For every group that contains at least one answerable question
    (``is_impossible`` is false) one tuple is produced:

    * element 0: ``"Extraheer vragen van context: <context>"``
    * element 1: the answerable questions of that group joined by a blank line

    Args:
        filename: path of the JSON file to read.

    Returns:
        A list of ``(prompt, questions)`` string tuples, in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    result = []
    current_context = None
    current_questions = []

    for paragraph in data['paragraphs']:
        context = paragraph['translated_context']
        if context != current_context:
            # A new context starts: emit the finished group under ITS OWN
            # context (not the context of the entry that triggered the switch).
            if current_questions:
                prompt = f"Extraheer vragen van context: {current_context}"
                result.append((prompt, '\n\n'.join(current_questions)))
            current_context = context
            current_questions = []
        # Only answerable questions are useful as generation targets.
        if not paragraph['is_impossible']:
            current_questions.append(paragraph['translated_question'])

    # The last group has no following context to trigger the flush above.
    if current_questions:
        prompt = f"Extraheer vragen van context: {current_context}"
        result.append((prompt, '\n\n'.join(current_questions)))

    return result

In [None]:
def read_dataset(filename):
    """Build (question prompt, answer) pairs for question answering.

    The JSON file is expected to have this structure::

        {
            "paragraphs": [
                {
                    "translated_context": "text",
                    "translated_question": "text",
                    "translated_answers": ["text", ...],
                    "is_impossible": true
                },
                ...
            ]
        }

    Every entry yields one tuple:

    * element 0: ``"Context: <context>\\n\\nVraag: <question>"``
    * element 1: the first translated answer, or ``"Onbekend"`` when the entry
      is marked ``is_impossible`` or carries no translated answer at all.

    Args:
        filename: path of the JSON file to read.

    Returns:
        A list of ``(prompt, answer)`` string tuples, in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    result = []
    for paragraph in data['paragraphs']:
        question_text = f"Context: {paragraph['translated_context']}\n\nVraag: {paragraph['translated_question']}"
        answer_text = "Onbekend"
        if not paragraph['is_impossible']:
            answers = paragraph['translated_answers']
            # An answerable entry without any translated answer is treated as
            # unanswerable instead of aborting the whole load with IndexError.
            if answers:
                answer_text = answers[0]
        result.append((question_text, answer_text))
    return result

In [None]:
from torch.utils.data import TensorDataset

def my_collator(data):
    """Turn a list of (input text, target text) pairs into one model batch.

    Uses the notebook-level ``tokenizer``. Both sides are padded to the longest
    sequence in the batch.

    Args:
        data: list of examples; ``example[0]`` is the input text and
            ``example[1]`` is the target text.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        Padding positions in ``labels`` are set to -100 so that the loss
        ignores them.
    """
    input_text = [example[0] for example in data]
    target_text = [example[1] for example in data]

    # Tokenize each side once; padding=True pads to the longest item in the
    # batch (what the deprecated pad_to_max_length=True did without max_length).
    encoded_inputs = tokenizer(input_text, return_tensors='pt', padding=True)
    encoded_targets = tokenizer(target_text, return_tensors='pt', padding=True)

    # Mask padded target positions: -100 is the index the loss skips, so the
    # model is not trained (or evaluated) on predicting padding.
    labels = encoded_targets['input_ids'].masked_fill(
        encoded_targets['attention_mask'] == 0, -100
    )

    return {
        "input_ids": encoded_inputs['input_ids'],
        "attention_mask": encoded_inputs['attention_mask'],
        "labels": labels,
    }

In [None]:
from sklearn.model_selection import train_test_split

# Hub identifier of the pretrained Dutch T5 checkpoint; the tokenizer and the
# model are both loaded from it so their vocabularies match.
model_name = 'yhavinga/t5-base-dutch'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the (prompt, answer) pairs and hold out 10% of them for validation.
# NOTE(review): the relative path assumes Google Drive is mounted under the
# current working directory — confirm before running on a fresh runtime.
data = read_dataset('drive/MyDrive/InputData/dev-nl.json')
# A fixed random_state keeps the train/validation split identical across runs,
# so validation losses from different sessions are comparable.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
# Show two training examples as a quick sanity check of the prompt format.
print(train_data[0])
print(train_data[1])

('Context: In zijn uitvoerig verslag schreef Céloron: "Alles wat ik kan zeggen is dat de inboorlingen van deze plaatsen zeer slecht staan tegenover de Fransen, en zich geheel aan de Engelsen wijden. Ik weet niet hoe ze teruggebracht kunnen worden." Nog voor zijn terugkeer naar Montreal kwamen er berichten over de situatie in de Ohio Country naar Londen en Parijs, waarbij elke partij voorstelde actie te ondernemen. William Shirley, de expansionistische gouverneur van de provincie Massachusetts Bay, was bijzonder krachtig en verklaarde dat de Britse kolonisten niet veilig zouden zijn zolang de Fransen aanwezig waren. De conflicten tussen de koloniën, die werden uitgevochten door plunderende partijen met Indiaanse bondgenoten, hadden al tientallen jaren geduurd, wat leidde tot een levendige handel in Europese koloniale gevangenen van beide kanten.\n\nVraag: Wat vond Celeron van de relaties met de Indianen?', 'zeer slecht gezind tegenover de Fransen, en zijn volledig toegewijd aan de Engel

In [None]:
from transformers.optimization import AdafactorSchedule
from transformers.optimization import get_constant_schedule

# Adafactor configured with an explicit, fixed learning rate: relative_step,
# scale_parameter and warmup_init are all disabled, so the optimizer does not
# derive its own step size.
adafactor = Adafactor(
    model.parameters(),
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False
)
# AdafactorSchedule is a proxy for Adafactor's internal relative-step schedule
# and starts from an initial lr of 0.0, which does not fit a fixed external lr.
# A constant schedule keeps the optimizer at the lr configured above.
lr_scheduler = get_constant_schedule(adafactor)
# NOTE(review): the Trainer in this notebook is created without `optimizers=`,
# so `adafactor` / `lr_scheduler` are not used for training unless they are
# passed explicitly as Trainer(..., optimizers=(adafactor, lr_scheduler)).

In [None]:
from transformers import Trainer, TrainingArguments

# Root folder for everything this run writes (checkpoints and logs).
main_output_dir = './drive/MyDrive/OutputModels/t5_new5'

training_args = TrainingArguments(
    # Output locations
    output_dir=f'{main_output_dir}/results',
    logging_dir=f'{main_output_dir}/logs',
    # Optimization budget
    num_train_epochs=20,
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 1000-step cadence; log every 100 steps
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=100,
    # After training, reload the best checkpoint seen during evaluation
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, hyper-parameters, data splits and batch collation into a Trainer.
# No `optimizers=` argument is given, so the Trainer builds its own optimizer
# and learning-rate schedule from `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=my_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 10685
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 26720
  Number of trainable parameters = 222884352


Step,Training Loss,Validation Loss
1000,0.7335,0.65171
2000,0.5358,0.626338
3000,0.4094,0.586958
4000,0.4749,0.593788
5000,0.3926,0.593623
6000,0.2821,0.628109
7000,0.2339,0.63264
8000,0.2811,0.58085
9000,0.2236,0.633897
10000,0.163,0.677743


***** Running Evaluation *****
  Num examples = 1188
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-1000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-1000/config.json
Configuration saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-1000/generation_config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-2000
Configuration saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-2000/config.json
Configuration saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-2000/generation_config.json
Model weights saved in ./drive/MyDrive/OutputModels/t5_new5/results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1188
  Batch size = 8
Savi

TrainOutput(global_step=26720, training_loss=0.18720045637027055, metrics={'train_runtime': 6752.9436, 'train_samples_per_second': 31.645, 'train_steps_per_second': 3.957, 'total_flos': 1.023546532105728e+17, 'train_loss': 0.18720045637027055, 'epoch': 20.0})