In [2]:
pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m284.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip uninstall -y transformers accelerate

Found existing installation: transformers 4.29.2
Uninstalling transformers-4.29.2:
  Successfully uninstalled transformers-4.29.2
Found existing installation: accelerate 0.20.3
Uninstalling accelerate-0.20.3:
  Successfully uninstalled accelerate-0.20.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install transformers accelerate

Collecting transformers
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting accelerate
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: safetensors, transformers, accelerate
Successfully installed accelerate-0.20.3 safetensors-0.3.1 transformers-4.30.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/surbhit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [9]:
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)okenizer_config.json: 100%|██████████| 88.0/88.0 [00:00<00:00, 4.84kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.12k/1.12k [00:00<00:00, 72.4kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 2.85MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 30.6kB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.28G/2.28G [06:37<00:00, 5.73MB/s]
Downloading (…)neration_config.json: 100%|██████████| 280/280 [00:00<00:00, 18.8kB/s]


In [17]:
samsum_dataset = load_from_disk('Samsum-Data/samsum_dataset/')
samsum_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [19]:
split_lengths = [len(samsum_dataset[split])for split in samsum_dataset]

print('Split lengths: {}'.format(split_lengths))
print('Features; {}'.format(samsum_dataset['train'].column_names))


Split lengths: [14732, 819, 818]
Features; ['id', 'dialogue', 'summary']


In [25]:
#Sample data
print("Dialogue")
print(samsum_dataset["train"][0]["dialogue"])

print("\nSummary")
print(samsum_dataset["train"][0]["summary"])

Dialogue
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)

Summary
Amanda baked cookies and will bring Jerry some tomorrow.


In [26]:
def convert_example_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation=True)

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask' : input_encodings['attention_mask'],
        'labels' : target_encodings['input_ids']
    }

In [27]:
samsum_dataset_pt = samsum_dataset.map(convert_example_to_features, batched=True)

                                                                   

In [28]:
samsum_dataset_pt['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [29]:
#Training
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [30]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=5, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

In [31]:

trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=samsum_dataset_pt["test"], 
                  eval_dataset=samsum_dataset_pt["validation"])

In [32]:
trainer.train()

  0%|          | 0/255 [00:00<?, ?it/s]You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.81 GiB total capacity; 2.79 GiB already allocated; 33.69 MiB free; 2.79 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [35]:
from box import ConfigBox