<a href="https://colab.research.google.com/github/ilsilfverskiold/transformers-nlp-docs/blob/main/cook/fine-tune/fine_tune_seqtoseq_tech_keywords_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medium Tutorial - Fine-Tune a Seq2Seq Model for Keyword Extraction
To follow the tutorial, please go [here](https://medium.com/gitconnected/fine-tune-smaller-nlp-models-with-hugging-face-for-specific-use-cases-1745813471dc). This script fine-tunes BART into a keyword extractor; you can find the final model [here](https://huggingface.co/ilsilfverskiold/tech-keywords-extractor). To understand what the script is doing, see the [full tutorial](https://medium.com/gitconnected/fine-tune-smaller-nlp-models-with-hugging-face-for-specific-use-cases-1745813471dc).

Make sure you set your runtime to T4 or better before running the script and always look out for overfitting.

In [None]:
# install dependencies (-U upgrades each package if it is already installed)
# datasets: provides load_dataset, used to fetch the training data
!pip install -U datasets
# accelerate: not imported directly in this notebook; presumably needed by the Trainer API - verify
!pip install -U accelerate
# transformers: tokenizer, model, data collator, Trainer and pipeline all come from here
!pip install -U transformers
# huggingface_hub: presumably supplies the huggingface-cli command used to log in before pushing - verify
!pip install -U huggingface_hub

In [None]:
from datasets import load_dataset

# import dataset from hugging face (it has two fields I'm interested in, 'text' and 'keywords')
# the returned object is indexed by split name in later cells (e.g. dataset['train'])
dataset = load_dataset("ilsilfverskiold/tech-keywords-topics-summary")
# check the dataset - as the last expression in the cell, the notebook displays it
dataset

In [None]:
# map out some examples from the dataset
def show_samples(dataset, num_samples=3, seed=42):
    """Print the text and keywords of a few shuffled training examples.

    Args:
        dataset: Mapping of split name to split. Only ``dataset["train"]`` is
            read; it must support ``shuffle(seed=...)`` and ``select(...)``.
        num_samples: Number of examples to print.
        seed: Seed forwarded to ``shuffle``.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    # keep only the first `num_samples` rows of the shuffled split
    for row in shuffled_train.select(range(num_samples)):
        print(f"\n'>> Text: {row['text']}'")
        print(f"'>> Keywords: {row['keywords']}'")


# print the default three shuffled training examples as a quick look at the data
show_samples(dataset)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# checkpoint that will be fine-tuned
model_name = 'facebook/bart-large'
# load the tokenizer and the pretrained seq2seq weights for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# measure the 'keywords' field here - point `texts` at the 'text' field to check the inputs as well
texts = dataset['train']['keywords']

# token count for every entry, with truncation enabled (max for BART is 1024 tokens)
token_counts = [len(tokenizer.encode(entry, truncation=True)) for entry in texts]
max_token_length = max(token_counts)
print(f"The longest text is {max_token_length} tokens long.")

In [None]:
# convert both the input text and the target keywords into the format needed to train a sequence-to-sequence model
# note: this preprocessing function would look different for another architecture, such as an encoder-only or decoder-only model

def get_feature(batch):
  """Tokenize a batch of examples into model inputs and labels.

  Args:
    batch: Mapping with a 'text' entry (inputs) and a 'keywords' entry
      (targets).

  Returns:
    Dict holding only 'input_ids', 'attention_mask' and 'labels', taken
    from the tokenizer output.
  """
  # passing the keywords as text_target is what yields the 'labels' entry;
  # truncation to max_length=1024 is requested for the call
  tokenized = tokenizer(batch['text'], text_target=batch['keywords'],
                        max_length=1024, truncation=True)

  wanted_keys = ('input_ids', 'attention_mask', 'labels')
  return {key: tokenized[key] for key in wanted_keys}

# run get_feature over the dataset in batches to add the tokenized fields
dataset_pt = dataset.map(get_feature, batched=True)
# display the processed dataset to confirm the new fields are present
dataset_pt

In [None]:
# the dataset should be formatted as PyTorch tensors with only the new fields
# i.e. this specifies which columns are returned when accessing the data - only the new fields will be returned
# (the three names below are the keys produced by get_feature)
columns = ['input_ids', 'labels', 'attention_mask']
dataset_pt.set_format(type='torch', columns=columns)

In [None]:
# the data collator is responsible for dynamically padding each batch to the maximum length in that batch,
# which is crucial for efficient training of transformer models like BART or T5.
# padding looks different depending on the type of model you use; this collator is specifically for seq-to-seq models

from transformers import DataCollatorForSeq2Seq

# the model is passed alongside the tokenizer
# (presumably so the collator can prepare decoder inputs from the labels - verify against the transformers docs)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

import inspect

# start training the model
# we're using the Trainer API which abstracts away a lot of complexity

# The install cell upgrades transformers to the latest release, and two keyword
# names used here changed between releases: `evaluation_strategy` was renamed
# to `eval_strategy`, and Trainer's `tokenizer` was renamed to
# `processing_class`. Pick whichever name the installed version accepts so the
# cell runs on both older and newer releases.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _args_params else 'evaluation_strategy'
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

training_args = TrainingArguments(
    output_dir='bart_tech_keywords',
    num_train_epochs=3, # your choice
    warmup_steps=500,
    per_device_train_batch_size=4, # keep a small batch size when working with a small GPU
    per_device_eval_batch_size=4,
    weight_decay=0.01, # helps prevent overfitting
    logging_steps=10,
    eval_steps=50, # base this on the size of your dataset and number of training epochs
    save_steps=1_000_000, # integer step count; presumably set this high so no intermediate checkpoints are written
    gradient_accumulation_steps=16, # running this on a small GPU
    **{_eval_strategy_key: 'steps'}, # evaluate every `eval_steps` steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_pt['train'],
    eval_dataset=dataset_pt['validation'],
    **{_tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
# save the model
# the pipeline cell loads the model from this same directory name, so keep the two in sync if you rename it
trainer.save_model('tech-keywords-extractor') # set the name you want it to be called

In [None]:
from transformers import pipeline

# test the model using Hugging Face's pipeline
# the model is loaded from the directory written by trainer.save_model
pipe = pipeline('summarization', model='tech-keywords-extractor')

# test the first item in the test set to see how it does
# read from `dataset` (the raw dataset loaded at the top), which holds the plain 'text' and 'keywords' strings
first_example = dataset['test'][0]
test_text = first_example['text']
keywords = first_example['keywords']
print("the text: ", test_text)
# the pipeline returns a list with one dict per input, so pull out the generated string
print("generated keywords: ", pipe(test_text)[0]['summary_text'])
print("original keywords: ", keywords)

In [None]:
# iterate over the test set to generate up to 50 examples at once
# `dataset` is the raw dataset loaded at the top; capping the count at the size of the
# test split avoids an IndexError when it holds fewer than 50 rows
num_examples = min(50, len(dataset['test']))
for i in range(num_examples):
    example = dataset['test'][i]
    text_test = example['text']
    keywords = example['keywords']
    print("text: ", text_test)
    print("generated keywords: ", pipe(text_test)[0]['summary_text'])
    print("original keywords: ", keywords)

In [None]:
# if you're satisfied we can push it to Hugging Face
# you'll need a token from your Hugging Face account to log in
# (presumably one with write permission, since the next cell uploads the model - verify)
!huggingface-cli login

In [None]:
# you would replace your own name here
# you do not need to create a repository beforehand
# Trainer.push_to_hub takes a commit message as its first argument, not a repository name,
# so push the model and tokenizer directly - their push_to_hub takes the repo id first
repo_id = "ilsilfverskiold/tech-keywords-extractor"
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)