In [None]:
# You might also need to uninstall transformers first: 
# !pip uninstall -y transformers
!pip install transformers==4.28.0

In [None]:
!pip install git+https://github.com/huggingface/accelerate

In [None]:
!pip install datasets evaluate rouge_score

In [2]:
# Importing platform library 
from platform import python_version  
# Getting Python interpreter version as a result 
print("Current Version of Python interpreter we are using-", python_version()) 

Current Version of Python interpreter we are using- 3.10.11


In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from datasets import load_dataset

dataset = load_dataset("Katerina-gopher/ru-summary")

Downloading readme:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading and preparing dataset csv/Katerina-gopher--ru-summary to /root/.cache/huggingface/datasets/Katerina-gopher___csv/Katerina-gopher--ru-summary-41b168de469a9e8c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/113M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/Katerina-gopher___csv/Katerina-gopher--ru-summary-41b168de469a9e8c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['summ', 'text'],
        num_rows: 121833
    })
    validation: Dataset({
        features: ['summ', 'text'],
        num_rows: 12103
    })
    test: Dataset({
        features: ['summ', 'text'],
        num_rows: 17512
    })
})

In [7]:
ds = dataset
ds = ds.pop('train')
ds

Dataset({
    features: ['summ', 'text'],
    num_rows: 121833
})

In [8]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['summ', 'text'],
        num_rows: 12103
    })
    test: Dataset({
        features: ['summ', 'text'],
        num_rows: 17512
    })
})

In [9]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [10]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summ"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12103 [00:00<?, ? examples/s]

Map:   0%|          | 0/17512 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForSeq2Seq

# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="Katerina-gopher/t5-ru-summary")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

In [13]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [14]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# model = AutoModelForSeq2SeqLM.from_pretrained("t5-ru-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
model.save_pretrained("t5-ru-small")

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-ru-summary",
    # output_dir="t5-small",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # train_dataset=tokenized_dataset["train"],
    # eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["test"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()