### Script to finetune pegasus model
source https://gist.github.com/jiahao87/50cec29725824da7ff6dd9314b53c4b3

add the path to the fine tuning data excel to filename variable

In [8]:
import pandas as pd

filename = "/Users/lakke21/Downloads/summarization-aacl/abstractive/Legal-Pegasus/exceldataset.xlsx"

df = pd.read_excel(filename)

#df.rename(columns = {'data':'source', 'summary':'target'}, inplace = True)
len(df)

100

In [9]:
df.head()

Unnamed: 0,source,target
0,Special Leave Petition Nos.\n823 24 of 1990.\n...,Petitioners ' lands were acquired by the respo...
1,ivil Appeal No. 4649 of 1989.\nFrom the Judgme...,Pursuant to a scheme enacted for the benefit o...
2,"Appeals, Nos. 275 276 of 1963.\nAppeals by spe...","By section 25 (4) of the Income tax Act, ""Wher..."
3,No. 7338 of 1981.\n(Under Article 32 of the Co...,Fundamental Rule 56(j) confers power on the ap...
4,(C) No. 677 of 1988.\n(Under Article 32 of the...,The Lt. Governor of Delhi amended the Delhi Po...


In [10]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, PegasusTokenizerFast

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

      
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = PegasusTokenizerFast.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length = 512)
    decodings = tokenizer(labels, truncation=True, padding=True, max_length = 256)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset,            # evaluation dataset
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      tokenizer=tokenizer
    )

  return trainer

In [11]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
train_texts, train_labels = (list(df['source'])), (list(df['target']))
  
model_name = 'nsi319/legal-pegasus'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.079, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 2.9653, 'learning_rate': 2e-05, 'epoch': 2.0}
{'train_runtime': 4053.1709, 'train_samples_per_second': 0.049, 'train_steps_per_second': 0.049, 'train_loss': 3.0221885681152343, 'epoch': 2.0}


TrainOutput(global_step=200, training_loss=3.0221885681152343, metrics={'train_runtime': 4053.1709, 'train_samples_per_second': 0.049, 'train_steps_per_second': 0.049, 'train_loss': 3.0221885681152343, 'epoch': 2.0})

In [12]:
import os
if not os.path.exists('./ouput_model/'):
    os.makedirs('./ouput_model/')
trainer.model.save_pretrained("./ouput_model/")
tokenizer.save_pretrained("./ouput_model/")


('./ouput_model/tokenizer_config.json',
 './ouput_model/special_tokens_map.json',
 './ouput_model/spiece.model',
 './ouput_model/added_tokens.json',
 './ouput_model/tokenizer.json')

In [13]:
!zip -r ouput_model.zip ./ouput_model/

  adding: ouput_model/ (stored 0%)
  adding: ouput_model/tokenizer_config.json (deflated 94%)
  adding: ouput_model/special_tokens_map.json (deflated 82%)
  adding: ouput_model/config.json (deflated 61%)
  adding: ouput_model/tokenizer.json (deflated 78%)
  adding: ouput_model/generation_config.json (deflated 44%)
  adding: ouput_model/spiece.model (deflated 50%)
  adding: ouput_model/pytorch_model.bin (deflated 7%)
