In [1]:
import numpy as np
import datasets 
from datasets import load_dataset
from accelerate import Accelerator
import torch
from torch.utils.data.dataloader import DataLoader
import transformers
from transformers import (
    AutoTokenizer,
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    default_data_collator,
    get_scheduler,
    set_seed,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
dataset = load_dataset("json", data_files="train.json")
dataset_test = load_dataset("json", data_files="test.json")
dataset_eval = load_dataset("json", data_files="eval.json")
dataset['eval'] = dataset_eval['train']
dataset['test'] = dataset_test['train']
dataset
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-6a36e5922eb05f45/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-891c751e5dc163e9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-0cae1a5ed046e7ee/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]



In [3]:
def tokenize_sample_data(data):
  # Max token size is 14536 and 215 for inputs and labels, respectively.
  # Here I restrict these token size.
  input_feature = mt5_tokenizer(data["article"], truncation=True, max_length=256)
  label = mt5_tokenizer(data["title"], truncation=True, max_length=64)
  return {
    "input_ids": input_feature["input_ids"],
    "attention_mask": input_feature["attention_mask"],
    "labels": label["input_ids"],
  }

tokenized_ds = dataset.map(
  tokenize_sample_data,
  remove_columns=["id", "title", "url", "article"],
  batched=True,
  batch_size=128)

tokenized_ds

Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-6a36e5922eb05f45\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-f294d5c24e8bbee2.arrow
Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-0cae1a5ed046e7ee\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-40c0019c2c1d69f4.arrow
Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-891c751e5dc163e9\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-a3e539b82b39e4b2.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30221
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3777
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3777
    })
})

In [4]:
# see https://huggingface.co/docs/transformers/main_classes/configuration
mt5_config = AutoConfig.from_pretrained(
  "google/mt5-small",
  max_length=128,
  length_penalty=0.6,
  no_repeat_ngram_size=2,
  num_beams=15,
)
model = (AutoModelForSeq2SeqLM
         .from_pretrained("google/mt5-small", config=mt5_config)
         .to(device))

In [5]:

data_collator = DataCollatorForSeq2Seq(
  mt5_tokenizer,
  model=model,
  return_tensors="pt")

In [6]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer
from utils import twrouge 

rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  encoded_arg = mt5_tokenizer(arg)
  return mt5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)
# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  preds, labels = eval_arg
  labels = np.where(labels != -100, labels, mt5_tokenizer.pad_token_id)
  text_preds = mt5_tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = mt5_tokenizer.batch_decode(labels, skip_special_tokens=True)
  text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
  text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
  sent_tokenizer_tw = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
  text_preds = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(p))) for p in text_preds]
  text_labels = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(l))) for l in text_labels]
  # compute ROUGE score with custom tokenization

  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )


In [7]:
from torch.utils.data import DataLoader

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)
for batch in sample_dataloader:
  print(batch)
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break

metrics_func([preds, labels])

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[   259, 210694,   5072,  ..., 147891,    292,      1],
        [   259,  22746,   9911,  ...,  44126,    261,      1],
        [   259,  48734, 191679,  ..., 111357,  57956,      1],
        [   259, 210694,   5072,  ...,   3541, 136360,      1],
        [ 78022,    276,    259,  ...,    261,   3236,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[   259,  73849,  26145,   2144,  36470,   3862,  99722, 102471, 102031,
            309,   6874, 201640,   1193, 127986,    879,   3916,  90821, 112530,
          21014,    267,   1637, 176430,  47728,  27333,    939,   4833,  23281,
           8882, 153832,  10559,   4153, 223367,    879,      1],
        [   259, 134176,  15778,  23175, 232014,    410,   1146,   5742,  33692,
          31225, 210707,    259, 183548,    292,  25231,  69196,    292,  

{'rouge1': 0.13491165114694526,
 'rouge2': 0.06066053511705686,
 'rougeL': 0.1362016806722689,
 'rougeLsum': 0.13500449415759513}

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
  output_dir = "mt5-summarize-tw",
  log_level = "error",
  num_train_epochs = 10,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adafactor",
  weight_decay = 0.01,
  per_device_train_batch_size = 2,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 100,
  predict_with_generate=True,
  generation_max_length = 128,
  save_steps = 500,
  logging_steps = 10,
  push_to_hub = False
)
tokenized_ds["train"]

In [None]:
from torch.utils.data import DataLoader
traindataset = tokenized_ds["train"]
evaldataset = tokenized_ds["eval"]
train_dataloader = DataLoader(
    traindataset, shuffle=True, collate_fn=data_collator, batch_size=2
)
eval_dataloader = DataLoader(
    evaldataset, shuffle=True, collate_fn=data_collator, batch_size=2
)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
)
lr_scheduler = get_scheduler(
        name='linear',
        optimizer=optimizer,
        num_warmup_steps=90,
        num_training_steps=4720,
)
for epoch in range(4):
    model.train()
    for step, batch in enumerate(train_dataloader):
        output = model(**batch)
        loss = output.loss
        loss = loss / 16
        print(loss)
        accelerator.backward(loss)
        if step % 16 == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

In [None]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
  model = model,
  args = training_args,
  data_collator = data_collator,
  compute_metrics = metrics_func,
  train_dataset = tokenized_ds["train"],
  eval_dataset = tokenized_ds["eval"].select(range(20)),
  tokenizer = mt5_tokenizer,
)


trainer.train()

In [None]:
import os
from transformers import AutoModelForSeq2SeqLM

# save fine-tuned model in local
os.makedirs("./trained_for_summarization_tw", exist_ok=True)
if hasattr(trainer.model, "module"):
  trainer.model.module.save_pretrained("./trained_for_summarization_tw")
else:
  trainer.model.save_pretrained("./trained_for_summarization_tw")



In [15]:
from torch.utils.data import DataLoader

# load local model
model = (AutoModelForSeq2SeqLM
         .from_pretrained("./trained_for_summarization_tw")
         .to(device))

# Predict with test data (first 5 rows)
sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break
def eval(eval_arg):
    text_preds, text_labels = eval_arg
    text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
    sent_tokenizer_tw = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(l))) for l in text_labels]
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )
# Replace -100 (see above)
labels = np.where(labels != -100, labels, mt5_tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = mt5_tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = mt5_tokenizer.batch_decode(labels, skip_special_tokens=True)
print(eval([text_preds, text_labels]))
# Show result
print("***** Input's Text *****")
print(dataset["test"]["article"][4])
print("***** Summary Text (True Value) *****")
print(text_labels[4])
print("***** Summary Text (Generated Text) *****")
print(text_preds[4])
print(len(text_labels))

{'rouge1': 0.44615599088573976, 'rouge2': 0.2272960247928959, 'rougeL': 0.3639858793497608, 'rougeLsum': 0.39796679438058746}
***** Input's Text *****
Molly　圖若是問到男人心中的理想女友輪廓，應該很多女生的第一反應都會說出甜美臉蛋、好身材、長髮或是氣質好等這些「大眾印象」中的條件，但實際上卻跟我們想像的差很大！國外男性網站《AskMen》曾票選出「理想女友」的10個特質，不僅外在條件佔比不高，第一名更讓人完全沒想到。雖然男生都喜歡被需要、被依靠的感覺，但若是沒有男友就什麼都不行，無時無刻都要緊黏著另一半，這樣「過度依賴」，只會讓人覺得厭煩。男生喜歡的是平時很獨立，但有時又會柔弱、需要他們保護的女生。男生喜歡聰明的女生並不是指要EQ 180、上知天文下知地理，而是能言之有物，擁有有趣的內在。如果空有美麗外表，內在空洞，聊天都聊不下去了，又如何愛上你。兩人在一起，除了個性、三觀是否契合外，性生活也是很重要的一環，和諧的性生活能讓雙方感情更加親密，但也不是指女生要一昧配合男生，而是能找雙方都能享受、喜歡的方式才對。有吸引力並不只侷限於漂亮的臉蛋、性感的身材，像是有人是腿控，腿美就加分；有人喜愛開朗的笑容，只要面對甜甜微笑就被融化，對男生而言，外表只是第一眼，魅力才是吸引他們的重點。人和人之間的相處，最基本的就是尊重，就算再親密也不能忽略。大部分男生都很討厭被管束的感覺，尤其兩人在一起最重要的就是能互相尊重，應該要尊重對方有自己的時間、意志和選擇，不要以「愛」之名，做出各種限制、干涉，給予適當的自由空間，兩人感情也能更長久。還記得學生時期，我們都很討厭被爸媽碎碎念嗎？當然情侶之間也一樣，如果無法理性溝通，只是一直嘮叨碎念，不僅會讓他失去耐心，妳也會變成他另一個「媽媽」。不論男女，每個人都會希望伴侶跟自己的家人、好友群們能相處融洽，彼此可以自在地融入對方的生活圈，畢竟人活在世上無法只顧愛情，誰都不太可能拋棄家人和朋友，只和另一半生活。沒有人是完美的，再優秀的人都會其不足之處，你無法只挑對方的優點愛而不接受他的缺點或壞習慣，要懂得包容、尊重對方，雙方一起找到平衡點，畢竟也不是只有你忍受他的壞習慣，同時他也在忍受你的。兩個人在

In [17]:
from torch.utils.data import DataLoader

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)
for batch in sample_dataloader:
  print(batch)
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=15,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=128,
    )
  labels = batch["labels"]
  break
print(preds)
print(metrics_func([preds, labels]))
print(type(metrics_func([preds, labels])))

{'input_ids': tensor([[   259, 210694,   5072,  ..., 147891,    292,      1],
        [   259,  22746,   9911,  ...,  44126,    261,      1],
        [   259,  48734, 191679,  ..., 111357,  57956,      1],
        [   259, 210694,   5072,  ...,   3541, 136360,      1],
        [ 78022,    276,    259,  ...,    261,   3236,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[   259,  73849,  26145,   2144,  36470,   3862,  99722, 102471, 102031,
            309,   6874, 201640,   1193, 127986,    879,   3916,  90821, 112530,
          21014,    267,   1637, 176430,  47728,  27333,    939,   4833,  23281,
           8882, 153832,  10559,   4153, 223367,    879,      1],
        [   259, 134176,  15778,  23175, 232014,    410,   1146,   5742,  33692,
          31225, 210707,    259, 183548,    292,  25231,  69196,    292,  