# Hugging Face - Summarization in Turkish

This notebook fine-tunes [google/mt5-small](https://huggingface.co/google/mt5-small) for Turkish text summarization.

For more background and details, see [this blog post](https://tsmatz.wordpress.com/2022/11/25/huggingface-japanese-summarization/).

In [None]:
# Mount Google Drive at /content/drive; this notebook is organized to be run
# in the Google Colab environment.
from google.colab import drive
drive.mount('/content/drive')

Install the package that the T5 tokenizer depends on.

In [None]:
!pip install protobuf==3.20.3

Install the packages required for ROUGE evaluation.

In [None]:
!pip install absl-py rouge_score nltk

## Check device

Check whether GPU is available.

In [None]:
import torch

# Query CUDA availability once and reuse the answer for both the report and
# the device selection.
cuda_available = torch.cuda.is_available()
if not cuda_available:
    print("GPU is not enabled.")
else:
    print("GPU is enabled.")
    print("device count: {}, current device: {}".format(torch.cuda.device_count(), torch.cuda.current_device()))
device = torch.device("cuda") if cuda_available else torch.device("cpu")

## Prepare data

In [None]:
!pip3 install datasets

There are various datasets for Turkish text summarization. Based on the results of [1], I chose the TR-News dataset.

[1] B. Baykara and T. Güngör, "Turkish abstractive text summarization using pretrained sequence-to-sequence models," Cambridge University Press, 2022.

In [None]:
from datasets import load_dataset

# The commented-out lines are alternative Turkish summarization datasets,
# kept for reference. TR-News is the dataset that is actually loaded.
#ds = load_dataset("csebuetnlp/xlsum", name="turkish")
#ds = load_dataset("mlsum", "tu")
ds = load_dataset("batubayk/TR-News")
# Display the loaded dataset object as the cell output.
ds

In [None]:
# Peek at the first training example.
ds["train"][0]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the google/mt5-small checkpoint; it is reused for
# tokenizing the dataset, decoding predictions and ROUGE tokenization.
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

## Fine-tune

For fine-tuning, tokenize the dataset.

In [None]:
def tokenize_sample_data(data):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        data: Batch from the dataset. Its ``"content"`` entry is tokenized as
            the model input and its ``"abstract"`` entry as the target.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` of the tokenized
        content (truncated to 1024 tokens) and ``labels`` holding the token
        ids of the abstract (truncated to 128 tokens).
    """
    encoded_articles = t5_tokenizer(data["content"], truncation=True, max_length=1024)
    encoded_summaries = t5_tokenizer(data["abstract"], truncation=True, max_length=128)
    features = {key: encoded_articles[key] for key in ("input_ids", "attention_mask")}
    features["labels"] = encoded_summaries["input_ids"]
    return features

# Raw columns that are dropped once the token ids have been produced.
raw_columns = ["abstract", "author", "content", "date", "source", "tags", "title", "topic", "url"]

# Tokenize the dataset in batches of 128 examples.
tokenized_ds = ds.map(
    tokenize_sample_data,
    batched=True,
    batch_size=128,
    remove_columns=raw_columns,
)

tokenized_ds

In [None]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM

# Load the checkpoint configuration and override the generation-related
# settings (beam search with 15 beams, outputs of at most 128 tokens).
mt5_config = AutoConfig.from_pretrained(
    "google/mt5-small",
    num_beams=15,
    no_repeat_ngram_size=2,
    length_penalty=0.6,
    max_length=128,
)

# Instantiate the pretrained weights with that configuration, then move the
# model to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", config=mt5_config)
model = model.to(device)

# To avoid ValueErrors caused by possible non-contiguous tensors.
for param in model.parameters():
    param.data = param.data.contiguous()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble tokenized examples into batches; it is given the
# tokenizer and the model and returns PyTorch tensors ("pt").
data_collator = DataCollatorForSeq2Seq(
    t5_tokenizer,
    model=model,
    return_tensors="pt")

In [None]:
!pip3 install evaluate

In [None]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

# Load the ROUGE metric through the `evaluate` library.
rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    """Split text into tokens with the mT5 tokenizer.

    Args:
        arg: Text to tokenize.

    Returns:
        The token strings corresponding to the ids that ``t5_tokenizer``
        produces for ``arg``.
    """
    token_ids = t5_tokenizer(arg).input_ids
    return t5_tokenizer.convert_ids_to_tokens(token_ids)

def _ids_to_numpy(ids):
    """Return token ids as a NumPy array, moving tensors to host memory first."""
    if hasattr(ids, "detach"):
        # Tensor input (e.g. the output of model.generate on a GPU) cannot be
        # handed to NumPy directly.
        ids = ids.detach().cpu().numpy()
    return np.asarray(ids)


def metrics_func(eval_arg):
    """Compute ROUGE scores between generated and reference summaries.

    Args:
        eval_arg: Pair ``(preds, labels)`` of token-id batches (NumPy arrays
            or tensors). ``preds`` may also be a tuple whose first element
            holds the generated ids.

    Returns:
        The value returned by ``rouge_metric.compute`` for the decoded
        predictions and references.
    """
    preds, labels = eval_arg
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded, so replace it with
    # the pad token in the predictions as well as in the labels.
    pad_id = t5_tokenizer.pad_token_id
    preds = _ids_to_numpy(preds)
    labels = _ids_to_numpy(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert id tokens to text
    text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Insert a line break (\n) after each sentence for ROUGE scoring.
    # (Note: adapt the sentence splitting when working with other languages.)
    sent_tokenizer_tr = RegexpTokenizer(u'[^!?.]*[!?.]')

    def _one_sentence_per_line(text):
        # Make sure the last sentence is terminated so the regex captures it.
        if not text.endswith(("!", "?", ".")):
            text = text + "."
        return "\n".join(s.strip() for s in sent_tokenizer_tr.tokenize(text))

    text_preds = [_one_sentence_per_line(p) for p in text_preds]
    text_labels = [_one_sentence_per_line(l) for l in text_labels]

    # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )

In [None]:
from torch.utils.data import DataLoader

# Generate summaries for a single batch of 5 test examples with `model`
# and score them.
sample_dataloader = DataLoader(
    tokenized_ds["test"].with_format("torch"),
    batch_size=5,
    collate_fn=data_collator,
)
generation_options = dict(
    num_beams=15,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
    max_length=128,
)
for batch in sample_dataloader:
    input_ids = batch["input_ids"].to(device)
    with torch.no_grad():
        preds = model.generate(input_ids, **generation_options)
    labels = batch["labels"]
    # Only the first batch is needed.
    break

metrics_func([preds, labels])

In [None]:
from transformers import Seq2SeqTrainingArguments

# "save_steps" and "save_total_limit" can be chosen freely,
# depending on your storage and memory constraints.
training_args = Seq2SeqTrainingArguments(
    output_dir = "mt5-summarize-tr-trnews",
    log_level = "error",
    num_train_epochs = 10,
    # Optimization: Adafactor with a linear schedule and 90 warm-up steps.
    learning_rate = 5e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 90,
    optim = "adafactor",
    weight_decay = 0.01,
    # 2 samples per device x 16 accumulation steps = 32 samples per
    # optimizer update on each device.
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 16,
    # Evaluate every 100 steps, generating summaries of up to 128 tokens.
    evaluation_strategy = "steps",
    eval_steps = 100,
    predict_with_generate=True,
    generation_max_length = 128,
    # Checkpoint every 500 steps and keep at most 3 checkpoints.
    save_steps = 500,
    logging_steps = 10,
    push_to_hub = False,
    save_total_limit=3
)

Build trainer. (Put it all together.)

In [None]:
from transformers import Seq2SeqTrainer
# Combine the model, training arguments, collator, metric function and
# datasets. Only the first 20 validation examples are used for evaluation.
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = metrics_func,
    train_dataset = tokenized_ds["train"],
    eval_dataset = tokenized_ds["validation"].select(range(20)),
    tokenizer = t5_tokenizer,
)

Run training.<br>

In [None]:
# Start fine-tuning with the trainer built from the arguments and datasets.
trainer.train()

## Generate Text (Summarize) with Fine-Tuned Model

Now let's see how the fine-tuned model generates summaries.<br>

In order to use it later, you can save the trained model.

In [None]:
import os

# Destination directory for the fine-tuned model (created when missing).
save_directory = "path/to/save"
os.makedirs(save_directory, exist_ok=True)

# When the trained model exposes a ``module`` attribute (i.e. it is wrapped),
# save the wrapped model; otherwise save the model itself.
model_to_save = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
model_to_save.save_pretrained(save_directory)

Load the fine-tuned model from the local directory.

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Reload the fine-tuned model from the directory it was saved to
# (`save_directory`), so that the save and load paths cannot diverge.
model = (AutoModelForSeq2SeqLM
         .from_pretrained(save_directory)
         .to(device))

In [None]:
from torch.utils.data import DataLoader

# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
    tokenized_ds["test"].with_format("torch"),
    collate_fn=data_collator,
    batch_size=5)
for batch in sample_dataloader:
    with torch.no_grad():
        preds = model.generate(
            batch["input_ids"].to(device),
            num_beams=15,
            num_return_sequences=1,
            no_repeat_ngram_size=1,
            remove_invalid_values=True,
            max_length=128,
        )
    labels = batch["labels"]
    # Only the first batch is needed.
    break

# Replace -100 in the labels with the pad token id so they can be decoded.
labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

# Show result
# The article text is stored in the "content" column (the dataset has no
# "text" column). Indexing the row first avoids loading the whole column.
print("***** Input's Text *****")
print(ds["test"][0]["content"])
print("***** Summary Text (True Value) *****")
print(text_labels[0])
print("***** Summary Text (Generated Text) *****")
print(text_preds[0])

In [None]:
# Show the third example of the batch. The article text is stored in the
# "content" column (the dataset has no "text" column).
print("***** Input's Text *****")
print(ds["test"][2]["content"])
print("***** Summary Text (True Value) *****")
print(text_labels[2])
print("***** Summary Text (Generated Text) *****")
print(text_preds[2])