In [1]:
from datasets import load_dataset, load_from_disk
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Data

### XSUM 

In [12]:
# data = load_dataset("xsum")
# data.save_to_disk('data/xsum')

Downloading builder script: 100%|██████████| 5.76k/5.76k [00:00<00:00, 2.79MB/s]
Downloading readme: 100%|██████████| 6.24k/6.24k [00:00<?, ?B/s]


Downloading and preparing dataset xsum/default to C:/Users/kevin/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data: 100%|██████████| 255M/255M [00:15<00:00, 16.5MB/s]
Downloading data: 2.72MB [00:00, 19.3MB/s]/2 [00:20<00:20, 20.51s/it]
Downloading data files: 100%|██████████| 2/2 [00:24<00:00, 12.38s/it]
                                                                                          

Dataset xsum downloaded and prepared to C:/Users/kevin/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00,  9.55it/s]

['document', 'summary', 'id']





In [2]:
data = load_from_disk('data/xsum')
df_train = pd.DataFrame(data=data['train'])
df_val = pd.DataFrame(data=data['validation'])
df_test = pd.DataFrame(data=data['test'])

df_train.head()

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


### CNN Dailymail

In [3]:
# data = load_dataset("cnn_dailymail", "3.0.0")
# data.save_to_disk('data/cnn_dailymail')

Downloading builder script: 100%|██████████| 8.33k/8.33k [00:00<?, ?B/s]
Downloading metadata: 100%|██████████| 9.88k/9.88k [00:00<00:00, 9.88MB/s]
Downloading readme: 100%|██████████| 15.1k/15.1k [00:00<00:00, 15.4MB/s]


Downloading and preparing dataset cnn_dailymail/3.0.0 to C:/Users/kevin/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data: 100%|██████████| 159M/159M [00:06<00:00, 23.6MB/s]
Downloading data: 100%|██████████| 376M/376M [00:15<00:00, 23.6MB/s]]
Downloading data: 46.4MB [00:01, 40.1MB/s]/5 [00:28<00:45, 15.01s/it]
Downloading data: 2.43MB [00:00, 8.22MB/s]                          ]
Downloading data: 2.11MB [00:00, 10.9MB/s]                          ]
Downloading data files: 100%|██████████| 5/5 [00:38<00:00,  7.63s/it]
                                                                                           

Dataset cnn_dailymail downloaded and prepared to C:/Users/kevin/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 11.93it/s]
                                                                                                   

In [4]:
data = load_from_disk('data/cnn_dailymail')
df_train = pd.DataFrame(data=data['train'])
df_val = pd.DataFrame(data=data['validation'])
df_test = pd.DataFrame(data=data['test'])

df_train.head()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


### Rename data columns

In [3]:
df_train.columns = ['text', 'summary', 'id']
df_val.columns = ['text', 'summary', 'id']
df_test.columns = ['text', 'summary', 'id']

## T5

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

In [None]:
class TextSummaryDataset(Dataset):
    def __init__(self, text_list, summary_list):
        self.text_list = text_list
        self.summary_list = summary_list

    def __len__(self):
        return len(self.text_list)

    def __getitem__(self, index):
        return self.text_list[index], self.summary_list[index]

def encode_data(text, summary):
    input_ids = tokenizer.encode(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")[0]
    output_ids = tokenizer.encode(summary, truncation=True, padding="max_length", max_length=64, return_tensors="pt")[0]
    return input_ids, output_ids

def train(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch[0].to(device)
        output_ids = batch[1].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=output_ids)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            output_ids = batch[1].to(device)

            outputs = model(input_ids=input_ids, labels=output_ids)
            loss = outputs.loss
            total_loss += loss.item()

    return total_loss / len(dataloader)

def generate_summary(text):
    input_ids = tokenizer.encode(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)
    output_ids = model.generate(input_ids=input_ids, max_length=64, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(output_ids.squeeze(), skip_special_tokens=True)
    return summary

In [None]:

text_list = [...]  # List of texts
summary_list = [...]  # List of summaries

dataset = TextSummaryDataset(text_list, summary_list)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

for epoch in range(5):
    train_loss = train(model, dataloader, optimizer, criterion)
    val_loss = evaluate(model, dataloader, criterion)
    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

train_encodings = tokenizer(df_train['text'].to_list(), truncation=True, padding=True)
val_encodings = tokenizer(df_val['text'].to_list(), truncation=True, padding=True)

train_labels = tokenizer(df_train['summary'].to_list(), truncation=True, padding=True)
val_labels = tokenizer(df_val['summary'].to_list(), truncation=True, padding=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
)

trainer.train()
model.save_pretrained('models/t5_model')

  0%|          | 0/3 [00:00<?, ?it/s]

KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

## BART

## GPT2