In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from datasets import load_metric
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence

import json
import re
import nltk

In [None]:
# Checkpoint to fine-tune; the tokenizer and the model weights are both loaded from it.
# The load emits a warning that the checkpoint's config type is `mt5` while the
# `T5*` classes are used to instantiate it.
pretrained_model_name = "cointegrated/rut5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
# Training hyperparameters shared by the cells below.
max_input_length = 1000   # tokenizer `max_length` for article text (inputs are padded/truncated to this)
max_target_length = 400   # tokenizer `max_length` for summaries; also the generation limit during validation
batch_size = 4            # batch size of the train/val/test DataLoaders
epochs = 5                # number of passes over the training loader
learning_rate = 2e-5      # learning rate handed to the optimizer

In [None]:
def load_data(file_path):
    """Read a JSON Lines file into parallel lists of texts and summaries.

    Each non-blank line of the file is parsed as one JSON object. A record
    is kept only when it has both a ``'text'`` and a ``'summary'`` key, so
    the two returned lists always have the same length and stay aligned.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A ``(texts_list, target_list)`` tuple where ``target_list[i]`` is the
        summary belonging to ``texts_list[i]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    texts_list = []
    target_list = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            item = json.loads(line)
            # Require both fields: checking only 'text' would raise KeyError
            # on a record that has no 'summary'.
            if 'text' in item and 'summary' in item:
                texts_list.append(item['text'])
                target_list.append(item['summary'])
    return texts_list, target_list

In [None]:
def chunk_text(text, chunk_size=512, padding_token="[PAD]"):
    """Split ``text`` into consecutive character chunks of ``chunk_size``.

    Chunks shorter than the longest chunk (in practice only the last one)
    get ``padding_token`` appended once per missing character.

    NOTE(review): because ``padding_token`` is repeated per missing
    character, a multi-character token such as the default ``"[PAD]"`` makes
    the padded chunk longer than ``chunk_size`` characters. That behaviour is
    kept as-is; confirm whether the literal marker is what the tokenizer
    expects.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        padding_token: String appended to short chunks.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    # An empty text yields no chunks; without this guard `max()` below would
    # raise ValueError on an empty sequence.
    if not text:
        return []

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    max_length = max(len(chunk) for chunk in chunks)

    padded_chunks = [chunk + padding_token * (max_length - len(chunk)) for chunk in chunks]

    return padded_chunks

In [None]:
def collate_fn(batch):
    """Pad and stack a list of dataset items into one batch dictionary.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` tensors. Inputs are padded with the tokenizer's pad id,
    attention masks with 0 and labels with -100, then inputs and masks are
    flattened to one row per item.

    Args:
        batch: List of item dicts as produced by the dataset.

    Returns:
        Dict with the padded ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'`` tensors, each with ``len(batch)`` rows.
    """
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]

    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    padded_attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    # Use the real number of items, not the global `batch_size`: the last
    # batch of an epoch is usually smaller, and reshaping it to `batch_size`
    # rows either fails or silently mixes tokens of different examples.
    n_items = len(batch)
    padded_input_ids = padded_input_ids.reshape(n_items, -1)
    padded_attention_masks = padded_attention_masks.reshape(n_items, -1)

    return {'input_ids': padded_input_ids, 'attention_mask': padded_attention_masks, 'labels': padded_labels}

In [None]:
# Read the train / validation / test splits. Each call returns two parallel
# lists: the article texts and their reference summaries.
texts_list_train, summary_list_train = load_data('gazeta_train.jsonl')
texts_list_val, summary_list_val = load_data('gazeta_val.jsonl')
texts_list_test, summary_list_test = load_data('gazeta_test.jsonl')

In [None]:
class SummaryDataset(Dataset):
    """Summarization dataset that encodes each article as a stack of chunks.

    The article is cut into pieces with ``chunk_text`` and every piece is
    tokenized separately (prefixed with ``"summarize: "``), so one item's
    ``input_ids`` / ``attention_mask`` have one row per piece. The summary
    is tokenized once and its pad positions are replaced by -100.

    Note: a class with the same name is defined again later in this
    notebook, so on a top-to-bottom run the later definition is the one
    bound to ``SummaryDataset``.
    """

    def __init__(self, texts, summaries, tokenizer, max_input_length, max_target_length):
        self.texts, self.summaries = texts, summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        # One item per article.
        return len(self.texts)

    def __getitem__(self, idx):
        article, reference = self.texts[idx], self.summaries[idx]

        # Tokenize every chunk on its own, padded/truncated to the input limit.
        encodings = [
            self.tokenizer.encode_plus(
                "summarize: " + piece,
                max_length=self.max_input_length,
                padding='max_length',
                truncation=True,
                return_tensors="pt",
            )
            for piece in chunk_text(article, chunk_size=self.max_input_length)
        ]
        id_rows = [enc['input_ids'].squeeze(0) for enc in encodings]
        mask_rows = [enc['attention_mask'].squeeze(0) for enc in encodings]

        # Stack the per-chunk rows into (n_chunks, seq_len) tensors.
        stacked_ids = pad_sequence(id_rows, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        stacked_mask = pad_sequence(mask_rows, batch_first=True, padding_value=0)

        target = self.tokenizer.encode(
            reference,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )[0]
        # -100 marks label positions that came from padding.
        target[target == self.tokenizer.pad_token_id] = -100

        return {'input_ids': stacked_ids, 'attention_mask': stacked_mask, 'labels': target}

In [None]:
class SummaryDataset(Dataset):
    """Summarization dataset yielding one fixed-length encoding per article.

    The article is prefixed with ``"summarize: "`` and tokenized to exactly
    ``max_input_length`` ids (padded or truncated). The summary is tokenized
    to ``max_target_length`` ids and its pad positions are replaced by -100.
    """

    def __init__(self, texts, summaries, tokenizer, max_input_length, max_target_length):
        self.texts, self.summaries = texts, summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        # One item per article.
        return len(self.texts)

    def _encode(self, text, max_length):
        # Tokenize `text` to a fixed length and return the tokenizer's output.
        return self.tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        article, reference = self.texts[idx], self.summaries[idx]

        source = self._encode("summarize: " + article, self.max_input_length)
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)

        target_ids = self._encode(reference, self.max_target_length)['input_ids'].squeeze(0)
        # -100 marks label positions that came from padding.
        target_ids[target_ids == self.tokenizer.pad_token_id] = -100

        return {'input_ids': source_ids, 'attention_mask': source_mask, 'labels': target_ids}

In [None]:
# Wrap each split in a SummaryDataset using the shared tokenizer and length limits.
train_dataset = SummaryDataset(texts_list_train, summary_list_train, tokenizer, max_input_length, max_target_length)
val_dataset = SummaryDataset(texts_list_val, summary_list_val, tokenizer, max_input_length, max_target_length)
test_dataset = SummaryDataset(texts_list_test, summary_list_test, tokenizer, max_input_length, max_target_length)

# Only the training loader shuffles. No `collate_fn` is passed, so the
# DataLoader's default collation is used for all three loaders.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [None]:
# Optimizer: PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# `eps` and `weight_decay` are set explicitly to the values the transformers
# implementation used by default, so the optimizer settings are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_loader) * epochs
# Linear decay of the learning rate over `total_steps`, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# ROUGE metric object used to score generated validation summaries.
# NOTE(review): the warning printed for this cell says passing
# `trust_remote_code=True` becomes mandatory for loading this metric in the
# next major `datasets` release.
rouge = load_metric('rouge')
# TensorBoard writer; scalars are written under the `logs_model` directory.
writer = SummaryWriter(log_dir='logs_model')

  rouge = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Fine-tune for `epochs` passes over the training loader. After each pass,
# run validation: average loss plus ROUGE of beam-search generations against
# the reference summaries. Losses and ROUGE scores are logged to TensorBoard.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Global step index = batches seen in earlier epochs + current batch.
        writer.add_scalar('Train/Loss', loss.item(), epoch * len(train_loader) + batch_idx)

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_train_loss}')
    try:
        model.eval()
        total_val_loss = 0
        val_outputs = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

                generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_length, num_beams=4)
                val_outputs.extend([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids])
                decoded_labels = []
                for l in labels:
                    # Drop the -100 positions (padding in the labels) before decoding.
                    l = l[l != -100]
                    decoded_labels.append(tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=False))

                val_labels.extend(decoded_labels)

        avg_val_loss = total_val_loss / len(val_loader)
        print(f'Validation Loss: {avg_val_loss}')

        writer.add_scalar('Validation/Loss', avg_val_loss, epoch)

        # NOTE(review): confirm the metric's default tokenization handles
        # Cyrillic text; if it does not, these scores are not meaningful.
        rouge_result = rouge.compute(predictions=val_outputs, references=val_labels)
        print(f'ROUGE-1: {rouge_result["rouge1"].mid.fmeasure:.4f}, ROUGE-2: {rouge_result["rouge2"].mid.fmeasure:.4f}, ROUGE-L: {rouge_result["rougeL"].mid.fmeasure:.4f}')

        writer.add_scalar('Validation/ROUGE-1', rouge_result["rouge1"].mid.fmeasure, epoch)
        writer.add_scalar('Validation/ROUGE-2', rouge_result["rouge2"].mid.fmeasure, epoch)
        writer.add_scalar('Validation/ROUGE-L', rouge_result["rougeL"].mid.fmeasure, epoch)
    except Exception as exc:
        # Validation stays best-effort (training continues), but the cause is
        # reported. A bare `except:` would also swallow KeyboardInterrupt,
        # making the cell impossible to stop during validation.
        print("Fail validation")
        print(f"{type(exc).__name__}: {exc}")

# Flush TensorBoard logs and persist the fine-tuned model and tokenizer.
writer.close()
model.save_pretrained("val_trained_model")
tokenizer.save_pretrained("val_tokenizer")

Epoch 1/5, Loss: 2.792672300893842
Validation Loss: 2.120660104259181
ROUGE-1: 0.1637, ROUGE-2: 0.0392, ROUGE-L: 0.1602
Epoch 2/5, Loss: 2.61161929162404
Validation Loss: 2.0827244753718106
ROUGE-1: 0.1736, ROUGE-2: 0.0434, ROUGE-L: 0.1698
Epoch 3/5, Loss: 2.549147611774561
Validation Loss: 2.0509361693962287
ROUGE-1: 0.1751, ROUGE-2: 0.0442, ROUGE-L: 0.1709
Epoch 4/5, Loss: 2.5137982921745943
Validation Loss: 2.0382783048154733
ROUGE-1: 0.1777, ROUGE-2: 0.0441, ROUGE-L: 0.1733
Epoch 5/5, Loss: 2.4970993428011887
Validation Loss: 2.0365842202439306
ROUGE-1: 0.1772, ROUGE-2: 0.0444, ROUGE-L: 0.1734


('val_tokenizer\\tokenizer_config.json',
 'val_tokenizer\\special_tokens_map.json',
 'val_tokenizer\\spiece.model',
 'val_tokenizer\\added_tokens.json')

In [None]:
# Save the current model weights a second time, under `df_trained_model`.
model.save_pretrained("df_trained_model")

In [None]:
def summarize_text(model, tokenizer, text, device, max_input_length=9559, max_output_length=554, num_beams=4, prefix="summarize: "):
    """Generate a summary of ``text`` with beam search.

    Args:
        model: Sequence-to-sequence model exposing ``to``, ``eval`` and
            ``generate``.
        tokenizer: Tokenizer exposing ``encode_plus`` and ``decode``.
        text: Article to summarize.
        device: Device the model and the input tensors are moved to.
        max_input_length: Token limit for the encoded input; longer inputs
            are truncated. The default equals the longest test article
            measured in characters elsewhere in this notebook.
        max_output_length: ``max_length`` passed to ``generate``. The default
            equals the longest test summary measured in characters.
        num_beams: Beam width passed to ``generate``.
        prefix: String prepended to ``text`` before tokenization. The default
            is the prompt the training dataset puts in front of every
            article; pass ``""`` to encode the raw text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    model.to(device)
    model.eval()

    # No `padding='max_length'`: a single sequence needs no padding, and
    # padding every input to `max_input_length` tokens only makes the encoder
    # process thousands of masked positions per call.
    inputs = tokenizer.encode_plus(
        prefix + text,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_length,
            num_beams=num_beams,
            length_penalty=2.0,
            early_stopping=True
        )

    # Only one input was encoded, so the first generated sequence is the result.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

    return summary

In [None]:
# Try the summarizer on one test article (index 15) with the default settings.
text = texts_list_test[15]
summarized_text = summarize_text(model, tokenizer, text, device)

In [None]:
# Length of the longest test article (`len` of the text itself, i.e. characters, not tokens).
len(max(texts_list_test, key=len))

9559

In [None]:
# Length of the longest reference summary in the test split (characters, not tokens).
len(max(summary_list_test, key=len))

554

In [None]:
# Collects generated summaries; entry i is the summary of texts_list_test[i].
summary_result_test = []

In [None]:
# Summarize the test articles in order, so summary_result_test[i] always
# belongs to texts_list_test[i].
# Start at the number of summaries already collected: re-running this cell
# after an interruption then resumes where it stopped, instead of restarting
# at 0 and appending duplicates that shift every later index.
for i in range(len(summary_result_test), len(texts_list_test)):
    print(f"text {i} in {len(texts_list_test)} texts")
    summary_result = summarize_text(model, tokenizer, texts_list_test[i], device)
    summary_result_test.append(summary_result)

text 0 in 5770 texts
text 1 in 5770 texts
text 2 in 5770 texts
text 3 in 5770 texts
text 4 in 5770 texts
text 5 in 5770 texts
text 6 in 5770 texts
text 7 in 5770 texts
text 8 in 5770 texts
text 9 in 5770 texts
text 10 in 5770 texts
text 11 in 5770 texts
text 12 in 5770 texts
text 13 in 5770 texts
text 14 in 5770 texts
text 15 in 5770 texts
text 16 in 5770 texts
text 17 in 5770 texts
text 18 in 5770 texts
text 19 in 5770 texts
text 20 in 5770 texts
text 21 in 5770 texts
text 22 in 5770 texts
text 23 in 5770 texts
text 24 in 5770 texts
text 25 in 5770 texts
text 26 in 5770 texts
text 27 in 5770 texts
text 28 in 5770 texts
text 29 in 5770 texts
text 30 in 5770 texts
text 31 in 5770 texts
text 32 in 5770 texts
text 33 in 5770 texts
text 34 in 5770 texts
text 35 in 5770 texts
text 36 in 5770 texts
text 37 in 5770 texts
text 38 in 5770 texts
text 39 in 5770 texts
text 40 in 5770 texts
text 41 in 5770 texts
text 42 in 5770 texts
text 43 in 5770 texts
text 44 in 5770 texts
text 45 in 5770 text

KeyboardInterrupt: 

In [None]:
# Write one JSON object per line: the article, its reference summary and the
# generated summary. Only articles that already have a generated summary are
# written; non-ASCII characters are stored unescaped.
with open('model_result.jsonl', 'w', encoding='utf-8') as out_file:
    for idx, generated in enumerate(summary_result_test):
        record = {
            'text': texts_list_test[idx],
            'expected_summary': summary_list_test[idx],
            'generated_summary': generated,
        }
        serialized = json.dumps(record, ensure_ascii=False)
        out_file.write(serialized + '\n')

In [None]:
# Number of generated summaries collected so far.
len(summary_result_test)

4223