<a href="https://colab.research.google.com/github/ilsilfverskiold/transformers-nlp-docs/blob/main/cook/fine-tune/fine_tune_seqtoseq_tech_keywords_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dependencies
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -U huggingface_hub

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     

In [2]:
from datasets import load_dataset

# Hugging Face Hub dataset; of its columns, 'text' and 'keywords' are the two used for fine-tuning
DATASET_ID = "ilsilfverskiold/tech-keywords-topics-summary"

dataset = load_dataset(DATASET_ID)
# leave the DatasetDict as the last expression so the notebook displays its splits and features
dataset

Downloading readme:   0%|          | 0.00/888 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7196 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/635 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/635 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 7196
    })
    validation: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 635
    })
    test: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 635
    })
})

In [3]:
# map out some examples from the dataset
def show_samples(dataset, num_samples=3, seed=42, split="train"):
    """Print the text and keywords of a few randomly chosen examples.

    Args:
        dataset: mapping from split name to a dataset whose rows expose
            'text' and 'keywords' fields.
        num_samples: how many examples to print. Clamped to the size of the
            split, so asking for more rows than exist prints every row
            instead of failing on an out-of-range index.
        seed: seed forwarded to ``shuffle`` so the same rows are shown on
            every run.
        split: name of the split to sample from (defaults to "train").
    """
    data = dataset[split]
    # never request more rows than the split holds (and never a negative count)
    count = max(0, min(num_samples, len(data)))
    sample = data.shuffle(seed=seed).select(range(count))
    for example in sample:
        print(f"\n'>> Text: {example['text']}'")
        print(f"'>> Keywords: {example['keywords']}'")


# preview three seeded-random training examples
show_samples(dataset, num_samples=3, seed=42)


'>> Text: Driverless car users will not be prosecuted for fatal crashes in UK'
'>> Keywords: Driverless Cars, Legal Issues, UK'

'>> Text: Google is embedding inaudible watermarks right into its AI generated music -'
'>> Keywords: Google, AI Music, Watermarks, Audio Technology'

'>> Text: What are your thoughts on Nextjs performance? Do you agree with this chart? - ( by 10up where Nextjs appears lower than WordPress on core vitals. Couldn’t post the image here due to community rules. But appreciate any other studies and thought you have on this matter.'
'>> Keywords: Next.js, Performance, 10up, WordPress'


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# set the correct model you'll be fine-tuning (see seq-to-seq model docs for more information)
model_name = 'facebook/bart-large'
# get the tokenizer and the pretrained weights for the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# check the token length of the keywords field - you can do this for both fields
texts = dataset['train']['keywords']

# Tokenize all texts and find the maximum length (max for BART is 1024 tokens).
# Truncation is deliberately left off: with truncation=True every encoding is capped
# at the model limit, so the reported maximum could never reveal over-long examples.
max_token_length = max(len(tokenizer.encode(text)) for text in texts)
print(f"The longest text is {max_token_length} tokens long.")

# make it explicit when some examples will be cut off during preprocessing
if max_token_length > tokenizer.model_max_length:
    print(f"Warning: this exceeds the model limit of {tokenizer.model_max_length} tokens; "
          "longer examples will be truncated during preprocessing.")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

The longest text is 96 tokens long.


In [7]:
# convert both the input text and the target text into a format suitable for training a sequence-to-sequence model
# remember data preprocessing functions would look different if you were using a model with a different architecture, such as an encoder-only or decoder-only model.

def get_feature(batch):
    """Tokenize a batch of rows into model inputs and training labels.

    The 'text' field is encoded as the input and the 'keywords' field as the
    target, with truncation at max_length=1024. Only the three fields needed
    for training are returned.
    """
    tokenized = tokenizer(
        batch['text'],
        text_target=batch['keywords'],
        max_length=1024,
        truncation=True,
    )
    wanted = ('input_ids', 'attention_mask', 'labels')
    return {key: tokenized[key] for key in wanted}


# batched=True hands get_feature many rows per call instead of one at a time
dataset_pt = dataset.map(get_feature, batched=True)
dataset_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7196
    })
    validation: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 635
    })
    test: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 635
    })
})

In [9]:
# return rows as PyTorch tensors, restricted to the tokenized fields;
# i.e. accessing the data yields only these new columns
columns = ['input_ids', 'labels', 'attention_mask']
dataset_pt.set_format('torch', columns=columns)

In [10]:
# seq-to-seq specific collator: it dynamically pads each batch to the longest
# example in that batch, which is crucial for efficient training of transformer
# models like BART or T5. Padding differs by model type, hence the dedicated
# seq-to-seq class here.

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

# start training the model
# we're using the Trainer API which abstracts away a lot of complexity

NUM_TRAIN_EPOCHS = 3   # your choice
TRAIN_BATCH_SIZE = 4   # keep a small batch size when working with a small GPU
GRAD_ACCUM_STEPS = 16  # running this on a small GPU

# Size the warmup from the real number of optimizer updates. With 7196 training rows
# and an effective batch of 4 * 16 = 64, three epochs are only ~336 updates, so a fixed
# 500-step warmup would never finish and the learning rate would never reach its peak.
# (Estimate assumes a single device.)
updates_per_epoch = max(1, len(dataset_pt['train']) // (TRAIN_BATCH_SIZE * GRAD_ACCUM_STEPS))
total_updates = updates_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = max(1, total_updates // 10)  # warm up over roughly the first 10% of training

training_args = TrainingArguments(
    output_dir='bart_tech_keywords',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_steps=warmup_steps,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=4,
    weight_decay=0.01,  # helps prevent overfitting
    logging_steps=10,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` -
    # confirm against the installed version
    evaluation_strategy='steps',
    eval_steps=50,  # base this on the size of your dataset and number of training epochs
    # no intermediate checkpoints (previously expressed as save_steps=1e6);
    # the final model is saved explicitly after training
    save_strategy='no',
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_pt['train'],
    eval_dataset=dataset_pt['validation'],
)

trainer.train()

In [None]:
# write the fine-tuned model to a local directory; pick whatever name you want it to have
trainer.save_model(output_dir='tech-keywords-extractor')

In [None]:
from transformers import pipeline

# test the model using Hugging Face's pipeline
pipe = pipeline('summarization', model='tech-keywords-extractor')

# test the first item in the test set to see how it does
# (read from the raw `dataset`: `dataset_pt` only returns the tokenized tensor columns)
first_example = dataset['test'][0]
text_test = first_example['text']
keywords = first_example['keywords']
print("the text: ", text_test)
# the pipeline returns a list with one dict per input; the generated string is under 'summary_text'
print("generated keywords: ", pipe(text_test)[0]['summary_text'])
print("original keywords: ", keywords)

In [None]:
# iterate over the test set to generate (up to) 50 examples at once
# rows are read from the raw `dataset`, which still exposes the 'text' and 'keywords' strings
num_examples = min(50, len(dataset['test']))  # don't run past the end of a smaller split
for i in range(num_examples):
    text_test = dataset['test'][i]['text']
    keywords = dataset['test'][i]['keywords']
    print("text: ", text_test)
    print("generated keywords: ", pipe(text_test)[0]['summary_text'])
    print("original keywords: ", keywords)

In [None]:
# if you're satisfied we can push it to Hugging Face
# you'll need a token (with write access) from your Hugging Face account to log in
# notebook_login() shows an in-notebook prompt rather than relying on an interactive shell command
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# you would replace your own name here
# you do not need to create a repository beforehand
# Trainer.push_to_hub() takes a commit message as its first argument, not a repo id, so the
# target repo is named explicitly by pushing the trained model and its tokenizer directly
repo_id = "ilsilfverskiold/tech-keywords-extractor"
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)