In [2]:
"""Script for fine-tuning Pegasus with XSUM dataset

   adapted from https://towardsdatascience.com/how-to-perform-abstractive-summarization-with-pegasus-3dd74e48bafb

"""
# https://stackoverflow.com/questions/65854722/huggingface-albert-tokenizer-nonetype-error-with-colab
!pip install sentencepiece  # this has to be installed before transformer
!pip install transformers
!pip install datasets  # install huggingface datasets

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 6.1MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 5.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 60.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch

In [4]:
torch.__version__

'1.7.1+cu101'

In [5]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [6]:
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset

In [7]:
def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      adafactor=True,                   # use adafactor instead of AdamW
      num_train_epochs=2000,           # total number of training epochs
      per_device_train_batch_size=2,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=2,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset             # evaluation dataset
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      adafactor=True,                  # use adafactor instead of AdamW
      num_train_epochs=2000,           # total number of training epochs
      per_device_train_batch_size=2,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
    )

  return trainer

In [8]:
# use XSum dataset as example, with first 1000 docs as training data
from datasets import load_dataset
dataset = load_dataset("xsum")
# train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
train_texts, train_labels = dataset['train']['document'], dataset['train']['summary']

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1966.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=954.0, style=ProgressStyle(description_…

Using custom data configuration default



Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/f9abaabb5e2b2a1e765c25417264722d31877b34ec34b437c53242f6e5c30d6d...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254582292.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1001503.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/f9abaabb5e2b2a1e765c25417264722d31877b34ec34b437c53242f6e5c30d6d. Subsequent calls will reuse this data.


In [10]:
len(train_texts)

204045

In [9]:
# use Pegasus Large model as base for fine-tuning
# the following takes about 45 minutes using Colab Pro with GPU
# ~ 15.41G GPU and 2.6G RAM used
# TPU does not work for unknown reason - just crash
# increase the batch size to 32 - crash
# full xsum dataset - crash - switch to high RAM instance
%%time
model_name = 'google/pegasus-large'

# about 8 mins to finish for the whole dataset
train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=88.0, style=ProgressStyle(description_w…


CPU times: user 7min 20s, sys: 3.49 s, total: 7min 24s
Wall time: 7min 30s


In [11]:
%%time
# only about 1 minute
trainer = prepare_fine_tuning(model_name, train_dataset)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2866.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275327883.0, style=ProgressStyle(descr…


CPU times: user 1min 11s, sys: 8.88 s, total: 1min 20s
Wall time: 3min 9s


In [12]:
%%time
# all XSUM samples with batch size 2, 2000 epochs ~40 minutes
trainer.train()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))


Step,Training Loss
10,11.0286
20,11.1859
30,11.0086
40,10.9032
50,10.6235
60,10.572
70,10.2687
80,10.2417
90,10.1401
100,9.64


CPU times: user 25min 9s, sys: 10min 39s, total: 35min 49s
Wall time: 36min 1s


TrainOutput(global_step=2000, training_loss=1.5598750679455697, metrics={'train_runtime': 2161.4197, 'train_samples_per_second': 0.925, 'total_flos': 14027908448256000, 'epoch': 2000.0})

In [13]:
!ls results/

checkpoint-1000  checkpoint-1500  checkpoint-2000  checkpoint-500


In [14]:
!ls results/checkpoint-2000/

"""
config.json   pytorch_model.bin  trainer_state.json
optimizer.pt  scheduler.pt	 training_args.bin
"""

config.json   pytorch_model.bin  trainer_state.json
optimizer.pt  scheduler.pt	 training_args.bin


'\nconfig.json   pytorch_model.bin  trainer_state.json\noptimizer.pt  scheduler.pt\t training_args.bin\n'

In [16]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')

In [23]:
# load the finetuned model
model_path = "./results/checkpoint-2000/"
model_finetune = PegasusForConditionalGeneration.from_pretrained(model_path, local_files_only=True).to(torch_device)

In [20]:
# load the pegasus large model
model_large = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large').to(torch_device)

In [21]:
# load the pegasus xsum model
model_xsum = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum').to(torch_device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275329241.0, style=ProgressStyle(descr…




In [31]:
def show_result(src_text, model):
  batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
  translated = model.generate(**batch)
  gen_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return gen_text

In [34]:
# after fine tune xsum dataset the result is same as large 
# but the result from official xsum fine tuned model is different
text_1 = [
    """ 
    House Speaker Nancy Pelosi, clutching a miniature American flag on the 
    steps of the Capitol, agreed. “This reminds me of what it must have felt 
    like at Valley Forge,” the San Francisco Democrat said. 
    “Everything is at stake.” It’s not much of an exaggeration. 
    Pelosi and her fellow representatives stood at the scene of the violent 
    insurrection that delayed Congress’ certification of President 
    Biden’s election just two months ago, while further security threats 
    canceled a House session scheduled for Thursday. Though the mob that 
    stormed the capital on Jan. 6 literally called for Mike Pence’s neck, 
    the former vice president typified the party’s stance by embracing the 
    false claims of election fraud that fueled the violence in an 
    op-ed published Wednesday, joining former President Donald Trump in 
    attacking HR1. Republican-run state legislatures, meanwhile, 
    already have begun moving bills to limit ballot access and 
    suppress participation. The party can also be counted on to oust 
    several members of the House’s slim Democratic majority in 
    two years not by persuading voters but through nakedly partisan 
    gerrymandering, which already gives Republicans an advantage over 
    their vote share of as many as 22 seats, or twice the current 
    Democratic edge, according to an Associated Press analysis. 
    A restoration of Republican control of Congress could in turn 
    mean that the next attempt to overturn certification of 
    a legitimate presidential election succeeds.  
    """
]

text_2 = [
    """ 
    Prior studies of social-norms interventions have focused on 
    nudging behavior in noncompetitive settings. This research evaluates 
    such interventions in competitive environments, for example, a class 
    with a competitive grading policy. Field experiments on a 
    learning management system show that providing descriptive 
    information about peers’ behavior has mixed effects in 
    reducing procrastination and improving performance outcomes. 
    Specifically, the effects are moderated by individual characteristics 
    and contextual variables. 
    First, peer information interventions are more effective for males, 
    and the effects are stronger in a male-majority environment than 
    in a female-majority environment. 
    These findings differ from prior studies of social-norms interventions 
    conducted in noncompetitive settings, 
    in which females are found to be more responsive to interventions. 
    Gender differences in our competitive settings can be explained 
    by males’ and females’ differential preferences for competition: 
    males are more competitive-oriented and thus are more responsive to 
    peer information in competitive environments. 
    Second, we find that individuals who are in great need of interventions, 
    that is, those with poor past behavior and performance, 
    are also more likely to benefit from peer information interventions, 
    suggesting that peer information interventions motivate positive change. 
    This study highlights the heterogeneous effects of peer information 
    interventions and has implications for targeted interventions.
    """
]


print('my fine-tuned model result 1:', show_result(text_1, model_finetune))
print('official large model result 1:', show_result(text_1, model_large))
print('official xsum model result 1:', show_result(text_1, model_xsum))

print('my fine-tuned model result 2:', show_result(text_2, model_finetune))
print('official large model result 2:', show_result(text_2, model_large))
print('official xsum model result 2:', show_result(text_2, model_xsum))

my fine-tuned model result 1: ['A suspicious vote that has been declared unlawful but not so tainted that it should be called an ally in an upcoming elections probably isn’t a done deal:']
official large model result 1: ['Pelosi and her fellow representatives stood at the scene of the violent insurrection that delayed Congress’ certification of President Biden’s election just two months ago, while further security threats canceled a House session scheduled for Thursday.']
official xsum model result 1: ['“This is like the Civil War,” Senate Majority Leader Mitch McConnell, R-Ky., said Wednesday as he introduced the Republican-backed HR1.']
my fine-tuned model result 2: ['Gender differences in our competitive settings can be explained by males’ and females’ differential preferences for competition: males are more competitive-oriented and thus are more responsive to peer information in competitive environments.']
official large model result 2: ['Gender differences in our competitive setti