<a href="https://colab.research.google.com/github/ilsilfverskiold/transformers-nlp-docs/blob/main/cook/fine-tune/fine_tune_encoder_huggingface_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -U huggingface_hub

In [None]:
from datasets import load_dataset

# Hugging Face Hub dataset id; the two fields used later on are 'text' and 'keywords'
dataset_id = "sunhaozhepy/ag_news_keywords_embeddings"

# download (or reuse the cached copy of) the dataset
dataset = load_dataset(dataset_id)

# display the splits and their columns
dataset

In [None]:
# create a validation set (as it is missing in this dataset)
# common practice is roughly 70-80% training, 10-20% validation and 10-20% testing,
# but this is a larger dataset so the validation and test sets can be kept smaller
from datasets import DatasetDict

# number of rows moved from the training split into the validation split
validation_size = 7600

# the validation rows are taken only from the training split, so they never overlap with the test split
shuffled_training_set = dataset['train'].shuffle(seed=42)
total_rows = len(shuffled_training_set)

# first `validation_size` shuffled rows -> validation, all remaining rows -> training
validation_set = shuffled_training_set.select(range(validation_size))
new_training_set = shuffled_training_set.select(range(validation_size, total_rows))

splits = {
    'train': new_training_set,
    'validation': validation_set,
    'test': dataset['test'],  # unchanged
}
new_dataset = DatasetDict(splits)

new_dataset

In [None]:
# map out some examples from the dataset
def show_samples(dataset, num_samples=3, seed=42):
    """Print the text and keywords of a few shuffled rows of the 'train' split.

    Args:
        dataset: mapping with a "train" entry that supports
            ``shuffle(seed=...)`` and ``select(...)``.
        num_samples: how many rows to print.
        seed: seed passed to ``shuffle`` so the same rows are shown every time.
    """
    train_split = dataset["train"]
    reshuffled = train_split.shuffle(seed=seed)
    for row in reshuffled.select(range(num_samples)):
        print(f"\n'>> Text: {row['text']}'")
        print(f"'>> Keywords: {row['keywords']}'")


show_samples(new_dataset)

In [None]:
# (optional)
# might be good to graph the distribution of length for your fields and filter out any outliers here
# use matplotlib

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'facebook/bart-large'
# get the tokenizer and the pretrained seq2seq model from Hugging Face based on the model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

texts = new_dataset['train']['text']

# tokenize all texts and find the maximum length of the texts - max token length for BART seems to be 1024 tokens
# truncation must stay off here: with truncation on, every encoding is cut at the
# tokenizer's maximum length, so the measurement could never reveal texts that are too long
max_token_length = max(len(tokenizer.encode(text, truncation=False)) for text in texts)
print(f"The longest text is {max_token_length} tokens long.")
# if it is longer than 1024 you'll need to filter or truncate the texts in some way

In [None]:
# (optional) create a function that filters out any rows with more than 5 keywords
# this is merely to make sure we train it on the data and results we want (3 - 5 keywords)
def filter_keywords(example):
    """Return True when the row's ', '-separated 'keywords' string has at most 5 entries."""
    keyword_list = example['keywords'].split(', ')
    too_many = len(keyword_list) > 5
    return not too_many

new_dataset = new_dataset.filter(filter_keywords)

new_dataset

In [None]:
# create a function that will convert both the input text and the target text into a format suitable for training a sequence-to-sequence model
# remember data preprocessing functions would look different if you were using a model with a different architecture, such as an encoder-only or decoder-only model.
def get_feature(batch):
  """Tokenize a batch for seq2seq training.

  The 'text' column is tokenized as the model input and the 'keywords'
  column as the target (via `text_target`); both are truncated to at most
  1024 tokens.

  Returns a plain dict holding only 'input_ids', 'attention_mask' and 'labels'.
  """
  tokenized = tokenizer(
      batch['text'],
      text_target=batch['keywords'],
      max_length=1024,
      truncation=True,
  )

  # keep only the fields the trainer needs
  wanted_keys = ('input_ids', 'attention_mask', 'labels')
  return {key: tokenized[key] for key in wanted_keys}


In [None]:
# tokenize the entire dataset by applying get_feature in batches (batched=True)
dataset_pt = new_dataset.map(get_feature, batched=True)

# displaying the result should now show the extra fields returned by get_feature
# ('input_ids', 'attention_mask', 'labels') that are necessary for training the model
dataset_pt

In [None]:
# format the dataset as PyTorch tensors
# `columns` specifies which columns are returned when accessing the data - only these new fields will be returned
columns = ['input_ids', 'labels', 'attention_mask']
dataset_pt.set_format(type='torch', columns=columns)

# display the dataset again after changing its output format
dataset_pt

In [None]:
# the data collator is responsible for dynamically padding each batch to the length of its longest example,
# which is crucial for efficient training of transformer models like BART or T5.
# padding looks different depending on the type of model you use; this collator is specifically for seq-to-seq models

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

# using the Trainer API which abstracts away a lot of complexity
training_args = TrainingArguments(
    output_dir='bart_keywords',
    num_train_epochs=3,  # your choice
    warmup_steps=500,
    per_device_train_batch_size=8,  # keep a small batch size when working with a small GPU - if working with T4 set this to 4
    per_device_eval_batch_size=8,  # If working with T4 set this to 4
    weight_decay=0.01,  # helps prevent overfitting
    logging_steps=10,
    # `evaluation_strategy` was renamed to `eval_strategy`; the notebook installs the
    # latest transformers (`pip install -U`), which only accepts the new name
    eval_strategy='steps',
    eval_steps=500,  # base this on the size of your dataset and number of training epochs (we're using a large dataset here)
    # integer step count; chosen very large so that (presumably) no intermediate
    # checkpoint is written during the run - the model is saved explicitly afterwards
    save_steps=1_000_000,
    gradient_accumulation_steps=16,  # running this on a small GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
    train_dataset=dataset_pt['train'],
    eval_dataset=dataset_pt['validation'],
)

trainer.train()  # may take 1-4 hours depending on horsepower (GPU) and size of model

# note: the training loss should go down. The validation loss may fluctuate between evaluation steps, but a validation loss that keeps increasing while the training loss keeps decreasing could be a sign of overfitting.

In [None]:
# save the fine-tuned model to a local directory
# make sure you set the directory name you want here - the test cell below loads the model from this same name
trainer.save_model('bart_keywords_model')

In [None]:
# test the model with the test set
from transformers import pipeline

# load the fine-tuned model saved earlier under 'bart_keywords_model'
pipe = pipeline('summarization', model='bart_keywords_model')

# take the first test example: its text is the input, its keywords are the reference
sample = new_dataset['test'][0]
text = sample['text']
keywords = sample['keywords']

print(text)        # input text
print(pipe(text))  # keywords generated by the model
print(keywords)    # reference keywords from the dataset

# you can iterate over several examples from the test set to see how it is doing with new data

In [None]:
# if you're satisfied we can push it to Hugging Face
# you'll need a token from your Hugging Face account to log in
!huggingface-cli login

In [None]:
# replace the username below with your own Hugging Face username
# you do not need to create a repository beforehand
trainer.push_to_hub("ilsilfverskiold/bart_keywords")