In [None]:
#!pip install evaluate

In [None]:
#!pip install nltk

In [None]:
#!pip install rouge_score

This notebook follows the Hugging Face course tutorial on summarization to fine-tune a pre-trained model that generates a title for a poem.

Tutorial: https://huggingface.co/course/chapter7/5?fw=pt

Step 1: Prepare the corpus for fine-tuning

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw poem data from a CSV file located next to this notebook (relative path).
df = pd.read_csv('PoetryFoundationData.csv')

In [None]:
# Display the freshly loaded DataFrame to inspect its columns and rows.
df

In [None]:
# Show the Title string of the first row exactly as it was read from the CSV.
df.iloc[0]['Title']

In [None]:
# Keep only the two columns used for fine-tuning (input: Poem, target: Title).
# .copy() makes this an independent DataFrame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[['Title', 'Poem']].copy()

In [None]:
def _flatten_line_breaks(text):
    """Replace each '\\r\\r\\n' sequence with a space and trim surrounding whitespace."""
    return text.replace('\r\r\n', ' ').strip()

# Apply the same clean-up to both text columns.
for text_column in ('Title', 'Poem'):
    df[text_column] = df[text_column].apply(_flatten_line_breaks)

In [None]:
# Display the DataFrame again to check the result of the whitespace clean-up.
df

In [None]:
# Record the character count of every poem and title; these columns drive the
# length filter and the longest-text lookup later in the notebook.
for text_column in ('Poem', 'Title'):
    df[f'{text_column}_len'] = df[text_column].map(len)

In [None]:
# remove all poems and titles which are too short or too long
# (empty strings are dropped; poems are capped below 10000 characters, titles below 100)
poem_in_range = (df['Poem_len'] > 0) & (df['Poem_len'] < 10000)
title_in_range = (df['Title_len'] > 0) & (df['Title_len'] < 100)
df = df[poem_in_range & title_in_range]

In [None]:
# Display the DataFrame after the length filter to see how many rows remain.
df

We're going to start with a dataset of just 1000 poem/title pairs for testing purposes. 

In [None]:
# Draw the 1000-row test subset with a fixed seed so that "Restart & Run All"
# always selects the same poems. min() guards against a filtered frame that
# has fewer than 1000 rows, where sample() would otherwise raise.
df = df.sample(n=min(1000, len(df)), random_state=42)
# Renumber rows 0..n-1 so index labels and positions coincide from here on.
df = df.reset_index(drop=True)

In [None]:
# Display the sampled, re-indexed DataFrame.
df

In [None]:
from datasets import Dataset

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset, then hold out 20% of the rows
# as the "test" split (the remaining 80% become "train").
# NOTE(review): split='validation' only labels the intermediate dataset; the
# train_test_split call below produces the splits that are actually used.
dataset = Dataset.from_pandas(df, split='validation')
# A fixed seed keeps train/test membership identical across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [None]:
# Display the split dataset (split names, columns and row counts).
dataset

Now that we have our dataset, we choose a pre-trained model and preprocess our data. 
The model I'll use is facebook/bart-base.
See paper for explanation and analysis of why I chose this model. 

In [None]:
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

# Tokenizer and model are loaded from the same checkpoint so vocabularies match.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
# BartModel is the bare encoder-decoder without a language-modeling head, so it
# cannot compute a seq2seq loss from labels or generate text. The trainer below
# (with predict_with_generate=True) needs the conditional-generation variant.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# from transformers import AutoTokenizer
# 
# model_checkpoint = 'facebook/bart-base'
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Testing the tokenizer: encode one sample sentence and show the resulting encoding
sample_sentence = "This is a test to see if we can tokenize correctly."
inputs = tokenizer(sample_sentence)
inputs

In [None]:
# Map the ids from the previous cell back to their token strings.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

In [None]:
# Get the max tokens for titles and poems

def _token_count(text, limit=1024):
    """Return how many token ids the tokenizer produces for `text`, truncated at `limit`.

    NOTE(review): 1024 is the cap used by the original cell, presumably the
    model's maximum input length; confirm against the checkpoint config.
    """
    return len(tokenizer(text, max_length=limit, truncation=True).input_ids)

# idxmax() returns an index *label*, so the row is looked up with .loc rather
# than the positional .iloc (the two only coincide right after reset_index).
# NOTE(review): the longest text in characters is used as a proxy for the
# longest text in tokens; the two are not guaranteed to be the same row.
max_poem = df.loc[df['Poem_len'].idxmax(), 'Poem']
max_title = df.loc[df['Title_len'].idxmax(), 'Title']

max_poem_length = _token_count(max_poem)
max_title_length = _token_count(max_title)

In [None]:
# Report the token budgets derived above.
print(f"max poem tokens length: {max_poem_length}")
print(f"max title tokens length: {max_title_length}")

In [None]:
def preprocess_function(data):
    """Tokenize a batch of poems (inputs) and titles (targets).

    Args:
        data: a batch mapping with "Poem" and "Title" entries, as passed by
            Dataset.map(..., batched=True).

    Returns:
        The poem encoding with the title token ids attached under "labels".
    """
    # Poems are truncated to the token budget computed earlier in the notebook.
    model_inputs = tokenizer(data["Poem"], max_length=max_poem_length, truncation=True)

    # Titles are the generation targets and get their own, shorter budget.
    labels = tokenizer(data["Title"], max_length=max_title_length, truncation=True)

    # The key must be "labels" (plural): that is the argument name the model's
    # forward pass uses to compute the loss, and the key the seq2seq data
    # collator pads.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Here we set the arguments for the DataTrainer building off a Sequence to Sequence base Trainer

batch_size = 8
num_train_epochs = 8
model_name = 'bart-base'

# Show the training loss with every epoch: one log entry per epoch's worth of batches
logging_steps = len(tokenized_datasets["train"]) // batch_size

# arguments, collected in a dict first so they can be inspected before use
training_options = dict(
    output_dir=f"{model_name}-finetuned-poems",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # generate full sequences at evaluation time so text metrics can be computed
    predict_with_generate=True,
    logging_steps=logging_steps,
    # NOTE(review): pushing requires Hub credentials; the login cell above is commented out
    push_to_hub=True,
)
args = Seq2SeqTrainingArguments(**training_options)

We now set up a metric to evaluate the model during training. For title generation (a summarization-style task), the standard metric is ROUGE.

In [None]:
# Set up the evaluation metric used during training

import evaluate
import nltk
from nltk.tokenize import sent_tokenize # sentence tokenizer

# Load the ROUGE implementation from the `evaluate` library.
metric = evaluate.load("rouge")
# sent_tokenize (used in later cells) needs NLTK's "punkt" sentence-tokenizer data,
# which is not bundled with the package and has to be downloaded once.
nltk.download("punkt") # we need to download this for some reason to run the metric.compute function

In [None]:
# Sanity check of the ROUGE metric on one hand-written prediction/reference pair

generated_title = "I absolutely loved reading the Hunger Games"
reference_title = "I loved reading the Hunger Games"

sample_predictions = [generated_title]
sample_references = [reference_title]
scores = metric.compute(predictions=sample_predictions, references=sample_references)

scores # this returns only the fmeasure (nothing else though I'm not sure why...)

We interpret the above ROUGE scores like this:
- ROUGE-1 is the overlap of unigrams (single words) between the generated title and the reference title ...

In [None]:
def one_sentence_summary(text):
    """Return the first sentence of `text`, or an empty string if no sentence is found."""
    sentences = sent_tokenize(text)
    if not sentences:
        return ""
    return sentences[0]

def evaluate_baseline(dataset, metric):
    """Score the "first sentence of the poem" baseline against the real titles.

    Args:
        dataset: a mapping-like split exposing "Poem" and "Title" columns.
        metric: an object whose compute(predictions=..., references=...) returns the scores.

    Returns:
        Whatever metric.compute returns for the baseline predictions.
    """
    first_sentences = list(map(one_sentence_summary, dataset["Poem"]))
    titles = dataset["Title"]
    return metric.compute(predictions=first_sentences, references=titles)

# Baseline: use each poem's first sentence as its title and score it on the training split.
score = evaluate_baseline(dataset["train"], metric)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Express each score as a percentage rounded to two decimals.
rouge_dict = {name: round(score[name] * 100, 2) for name in rouge_names}
rouge_dict

We interpret the baseline scores above as follows:
- Firstly, the ROUGE-2 score is much lower... (TODO: explain why)

In [None]:
# This function officially computes the metrics of the predictions so we can calculate during the training

def compute_metrics(eval_pred):
    """Decode generated and reference titles and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair of token-id arrays as supplied
            by the trainer during evaluation. Predictions may arrive wrapped
            in a tuple, in which case the first element is used.

    Returns:
        A dict mapping each metric name to its score as a percentage,
        rounded to four decimals.
    """
    preds, labels = eval_pred

    # Some models return extra outputs next to the generated ids; keep only the ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded/ignored positions and is not a valid token id, so it
    # must be swapped for the pad token before decoding (predictions AND labels).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode generated and reference titles into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Convert the scores to percentages and round them for display
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
# This is the data collator to pad the inputs and outputs of each batch.
# NOTE(review): passing model=model presumably lets the collator prepare
# decoder inputs from the labels; confirm against the DataCollatorForSeq2Seq docs.

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Testing the data collator

# Drop the raw text/length columns so only tokenizer outputs reach the collator.
# NOTE: this step is also required for training, not just for this test.
# Only columns that are still present are removed, so re-running the cell
# does not fail on columns that were already dropped.
present_columns = set(tokenized_datasets["train"].column_names)
raw_columns = [name for name in dataset["train"].column_names if name in present_columns]
if raw_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(raw_columns)

# Collate the first two training examples into one padded batch and display it.
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

In [None]:
# Inspect the column schema (feature names and types) of the training split

tokenized_datasets['train'].features

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration assembled above.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the trainer's eval_dataset (the "test" split).
trainer.evaluate()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): requires valid Hub credentials; the notebook_login cell earlier is commented out.
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Now that we've fine-tuned our model, let's use it!

In [None]:
from transformers import pipeline

# NOTE(review): this id points at the "huggingface-course" namespace, while
# trainer.push_to_hub presumably uploads to the logged-in user's namespace.
# Confirm the repository id matches where the model was actually pushed.
hub_model_id = "huggingface-course/bart-base-finetuned-poems"
# Build a summarization pipeline around the fine-tuned checkpoint.
summarizer = pipeline("summarization", model=hub_model_id)

In [None]:
# Smoke-test the pipeline on a placeholder input.
summarizer('POEM this is a test poem..')

Testing the model here...