In [1]:
!source .env/bin/activate

In [2]:
#!pip install evaluate

In [3]:
#!pip install nltk

In [4]:
#!pip install rouge_score

This code will use the HuggingFace tutorial to fine-tune a model with a dataset. 

Tutorial: https://huggingface.co/course/chapter7/5?fw=pt

Step 1: Prepare the corpus for fine-tuning

In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('PoetryFoundationData.csv')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,
...,...,...,...,...,...
13849,13,\r\r\n 1-800-FEAR\r\r\n ...,\r\r\nWe'd like to talk with you about ...,Jody Gladding,"Living,Social Commentaries,Popular Culture"
13850,14,\r\r\n The Death of Atahual...,\r\r\n\r\r\n,William Jay Smith,
13851,15,\r\r\n Poet's Wish\r\r\n ...,\r\r\n\r\r\n,William Jay Smith,
13852,0,\r\r\n 0\r\r\n,\r\r\n Philosophic\r\r\nin its comple...,Hailey Leithauser,"Arts & Sciences,Philosophy"


In [8]:
df.iloc[0]['Title']

'\r\r\n                    Objects Used to Prop Open a Window\r\r\n                '

In [9]:
df = df[['Title', 'Poem']]

In [10]:
df['Title'] = df['Title'].apply(lambda x: x.replace('\r\r\n', ' ').strip())
df['Poem'] = df['Poem'].apply(lambda x: x.replace('\r\r\n', ' ').strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: x.replace('\r\r\n', ' ').strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Poem'] = df['Poem'].apply(lambda x: x.replace('\r\r\n', ' ').strip())


In [11]:
df

Unnamed: 0,Title,Poem
0,Objects Used to Prop Open a Window,"Dog bone, stapler, cribbage board, garlic pres..."
1,The New Church,"The old cupola glinted above the clouds, shone..."
2,Look for Me,Look for me under the hood of that old Chevrol...
3,Wild Life,"Behind the silo, the Mother Rabbit hunches lik..."
4,Umbrella,When I push your button you fly off the handle...
...,...,...
13849,1-800-FEAR,We'd like to talk with you about fear t...
13850,The Death of Atahuallpa,
13851,Poet's Wish,
13852,0,"Philosophic in its complex, ovoid emptiness, a..."


In [12]:
df['Poem_len'] = df['Poem'].apply(lambda x: len(x))
df['Title_len'] = df['Title'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Poem_len'] = df['Poem'].apply(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title_len'] = df['Title'].apply(lambda x: len(x))


In [13]:
# remove all poems and titles which are too short or too long
df = df[df['Poem_len'] > 0]
df = df[df['Title_len'] > 0]
df = df[df['Poem_len'] < 10000]
df = df[df['Title_len'] < 100]

In [14]:
df

Unnamed: 0,Title,Poem,Poem_len,Title_len
0,Objects Used to Prop Open a Window,"Dog bone, stapler, cribbage board, garlic pres...",575,34
1,The New Church,"The old cupola glinted above the clouds, shone...",657,14
2,Look for Me,Look for me under the hood of that old Chevrol...,389,11
3,Wild Life,"Behind the silo, the Mother Rabbit hunches lik...",911,9
4,Umbrella,When I push your button you fly off the handle...,629,8
...,...,...,...,...
13835,!,"Dear Writers, I’m compiling the first in what ...",211,1
13848,1 January 1965,The Wise Men will unlearn your name. Above you...,785,14
13849,1-800-FEAR,We'd like to talk with you about fear t...,661,10
13852,0,"Philosophic in its complex, ovoid emptiness, a...",472,1


We're going to start with a dataset of just 1000 poem/title pairs for testing purposes. 

In [15]:
df = df.sample(1000)
df = df.reset_index(drop=True)

In [16]:
df

Unnamed: 0,Title,Poem,Poem_len,Title_len
0,On Mother's Day,On Mother's Day it isn't smart To give your mo...,446,15
1,The Education of a Poet,"Her pencil poised, she's ready to create, Then...",242,23
2,Populist,"I dreamed myself of their people, I am of thei...",1375,8
3,From “Anagrams” [xxi],One of Halberg’s more whimsical decisions: jus...,2009,21
4,Harold's Chicken Shack #1,"i was born by a lake, chicken shack, ...",785,25
...,...,...,...,...
995,[Long Neglect Has Worn Away],Long neglect has worn away\rHalf the sweet enc...,384,28
996,Subject and Object,"On “Sueño No. 5: Botella del mar,” a photograp...",1254,18
997,"Song: Sweetest love, I do not go","Sweetest love, I do not go, For weari...",1290,32
998,"Man In Boat, 1998",It’s unbearable to shadowdrift along the seabe...,1312,17


In [17]:
from datasets import Dataset

In [18]:
dataset = Dataset.from_pandas(df, split='validation')
dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Poem', 'Poem_len', 'Title_len'],
        num_rows: 800
    })
    test: Dataset({
        features: ['Title', 'Poem', 'Poem_len', 'Title_len'],
        num_rows: 200
    })
})

Now that we have our dataset, we choose a pre-trained model and preprocess our data. 
The model I'll use is facebook/bart-base.
See paper for explanation and analysis of why I chose this model. 

In [20]:
from transformers import BartTokenizer, BartModel
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartModel.from_pretrained('facebook/bart-base')

In [21]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/rkohli/.huggingface/token
Login successful


In [22]:
# from transformers import AutoTokenizer
# 
# model_checkpoint = 'facebook/bart-base'
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [23]:
# Testing the tokenizer
inputs = tokenizer("This is a test to see if we can tokenize correctly.")
inputs

{'input_ids': [0, 713, 16, 10, 1296, 7, 192, 114, 52, 64, 19233, 2072, 12461, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['<s>',
 'This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġto',
 'Ġsee',
 'Ġif',
 'Ġwe',
 'Ġcan',
 'Ġtoken',
 'ize',
 'Ġcorrectly',
 '.',
 '</s>']

In [25]:
# Get the max tokens for titles and poems

max_poem = df.iloc[df['Poem_len'].idxmax()]['Poem']
max_title = df.iloc[df['Title_len'].idxmax()]['Title']

max_poem_length = len(tokenizer.convert_ids_to_tokens(tokenizer(max_poem, 
                                                                max_length=1024, 
                                                                truncation=True).input_ids))
max_title_length = len(tokenizer.convert_ids_to_tokens(tokenizer(max_title, 
                                                                max_length=1024, 
                                                                truncation=True).input_ids))

In [28]:
print("max poem tokens length: " + str(max_poem_length))
print("max title tokens length: " + str(max_title_length))

max poem tokens length: 1024
max title tokens length: 26


In [29]:
def preprocess_function(data):
    
    model_inputs = tokenizer(data["Poem"], max_length = max_poem_length, truncation=True)
    
    labels = tokenizer(text_target=data["Title"], max_length = max_title_length, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [30]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
# Here we set the arguments for the DataTrainer building off a Sequence to Sequence base Trainer

batch_size = 8
num_train_epochs = 8

# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = 'bart-base'

# arguments
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-poems",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

We now create a metric to evaluate the training => for text title generation the right metric is "Rouge"

In [32]:
# setup evaluation metric for training

import evaluate
import nltk
from nltk.tokenize import sent_tokenize # sentence tokenizer

metric = evaluate.load("rouge")
#nltk.download("punkt")

In [33]:
# functions to test the rouge computational metric

generated_title = "I absolutely loved reading the Hunger Games"
reference_title = "I loved reading the Hunger Games"

scores = metric.compute(predictions=[generated_title], references=[reference_title])

scores # this returns only the fmeasure (nothing else though I'm not sure why...)

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

We interperet the above rouge scores like this:
- rouge 1 is the ...

In [35]:
def one_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:1])

def evaluate_baseline(dataset, metric):
    summaries = [one_sentence_summary(text) for text in dataset["Poem"]]
    return metric.compute(predictions=summaries, references=dataset["Title"])

score = evaluate_baseline(dataset["train"], metric)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
rouge_dict

{'rouge1': 8.34, 'rouge2': 4.47, 'rougeL': 8.04, 'rougeLsum': 8.04}

^ we interepret these as such:
- Firstly, the rouge2 score is much lower... (here's why: ??)

In [36]:
# This function offically computes the metrics of the predictions so we can calculate during the training

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    
    # Decode generated titles into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode reference titles into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [37]:
# This is the data collator to pad the inputs and outputs

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [38]:
# Testing the data collator

tokenized_datasets = tokenized_datasets.remove_columns(dataset["train"].column_names)
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': tensor([[    0,   271, 25824,  ...,  1437,  1437,     2],
        [    0,   133,    86,  ...,  6219, 14193,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[    0,   133,  5463,  3697,     9,     5, 11253,     2],
        [    0,   250,  5205, 40689,     2,  -100,  -100,  -100]])}

In [39]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Now that we've fine-tuned our model, let's use it!

In [None]:
from transformers import pipeline

hub_model_id = "huggingface-course/bart-base-finetuned-poems"
summarizer = pipeline("summarization", model=hub_model_id)

In [None]:
summarizer('POEM this is a test poem..')

Testing the model here...