In [2]:
import numpy as np
import datasets 
from datasets import load_dataset
from accelerate import Accelerator
import torch
from torch.utils.data.dataloader import DataLoader
import transformers
from transformers import (
    AutoTokenizer,
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    default_data_collator,
    get_scheduler,
    set_seed,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = load_dataset("json", data_files="train.json")
dataset_test = load_dataset("json", data_files="test.json")
dataset_eval = load_dataset("json", data_files="eval.json")
dataset['eval'] = dataset_eval['train']
dataset['test'] = dataset_test['train']
dataset
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

def tokenize_sample_data(data):
  # Max token size is 14536 and 215 for inputs and labels, respectively.
  # Here I restrict these token size.
  input_feature = mt5_tokenizer(data["article"], truncation=True, max_length=256)
  label = mt5_tokenizer(data["title"], truncation=True, max_length=64)
  return {
    "input_ids": input_feature["input_ids"],
    "attention_mask": input_feature["attention_mask"],
    "labels": label["input_ids"],
  }

tokenized_ds = dataset.map(
  tokenize_sample_data,
  remove_columns=["id", "title", "url", "article"],
  batched=True,
  batch_size=128)



import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer


rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  encoded_arg = mt5_tokenizer(arg)
  return mt5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)
# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  preds, labels = eval_arg
  labels = np.where(labels != -100, labels, mt5_tokenizer.pad_token_id)
  text_preds = mt5_tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = mt5_tokenizer.batch_decode(labels, skip_special_tokens=True)
  text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
  text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
  sent_tokenizer_tw = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
  text_preds = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(p))) for p in text_preds]
  text_labels = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(l))) for l in text_labels]
  # compute ROUGE score with custom tokenization

  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )

from torch.utils.data import DataLoader





Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-6a36e5922eb05f45/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-891c751e5dc163e9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (C:/Users/KevinChou/.cache/huggingface/datasets/json/default-0cae1a5ed046e7ee/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-6a36e5922eb05f45\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-f294d5c24e8bbee2.arrow
Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-0cae1a5ed046e7ee\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-40c0019c2c1d69f4.arrow
Loading cached processed dataset at C:\Users\KevinChou\.cache\huggingface\datasets\json\default-891c751e5dc163e9\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-a3e539b82b39e4b2.arrow


In [None]:
mt5_config = AutoConfig.from_pretrained(
  "google/mt5-small",
  max_length=128,
  length_penalty=0.6,
  no_repeat_ngram_size=2,
  num_beams=10,
)
model = (AutoModelForSeq2SeqLM
         .from_pretrained("google/mt5-small", config=mt5_config)
         .to(device))

data_collator = DataCollatorForSeq2Seq(
  mt5_tokenizer,
  model=model,
  return_tensors="pt")

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)

In [9]:
from tqdm import tqdm
print(len(sample_dataloader))

count = 0
total_metrice = {
  'rouge1': 0,
  'rouge2': 0,
  'rougeL': 0,
  'rougeLsum': 0
}
for batch in tqdm(sample_dataloader):
  #print(batch)
  count += 1
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=10,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=64,
    )
  labels = batch["labels"]
  met = metrics_func([preds, labels])
  total_metrice['rouge1'] += met['rouge1']
  total_metrice['rouge2'] += met['rouge2']
  total_metrice['rougeL'] += met['rougeL']
  total_metrice['rougeLsum'] += met['rougeLsum']
total_metrice['rouge1'] /= count
total_metrice['rouge2'] /= count
total_metrice['rougeL'] /= count
total_metrice['rougeLsum'] /= count
total_metrice
#metrics_func([preds, labels])

756


100%|██████████| 756/756 [22:10<00:00,  1.76s/it]


{'rouge1': 0.1406464367690917,
 'rouge2': 0.06483060634646234,
 'rougeL': 0.1382132647823016,
 'rougeLsum': 0.13735610576505136}

In [10]:
mt5_config = AutoConfig.from_pretrained(
  "google/mt5-small",
  max_length=128,
  length_penalty=0.6,
  no_repeat_ngram_size=2,
  num_beams=10,
)
model = (AutoModelForSeq2SeqLM
         .from_pretrained("./iter_trained_for_summarization_tw", config=mt5_config)
         .to(device))

data_collator = DataCollatorForSeq2Seq(
  mt5_tokenizer,
  model=model,
  return_tensors="pt")

sample_dataloader = DataLoader(
  tokenized_ds["test"].with_format("torch"),
  collate_fn=data_collator,
  batch_size=5)

In [11]:
from tqdm import tqdm
print(len(sample_dataloader))
count = 0
total_metrice = {
  'rouge1': 0,
  'rouge2': 0,
  'rougeL': 0,
  'rougeLsum': 0
}
for batch in tqdm(sample_dataloader):
  #print(batch)
  count += 1
  with torch.no_grad():
    preds = model.generate(
      batch["input_ids"].to(device),
      num_beams=10,
      num_return_sequences=1,
      no_repeat_ngram_size=1,
      remove_invalid_values=True,
      max_length=64,
    )
  labels = batch["labels"]
  met = metrics_func([preds, labels])
  total_metrice['rouge1'] += met['rouge1']
  total_metrice['rouge2'] += met['rouge2']
  total_metrice['rougeL'] += met['rougeL']
  total_metrice['rougeLsum'] += met['rougeLsum']
total_metrice['rouge1'] /= count
total_metrice['rouge2'] /= count
total_metrice['rougeL'] /= count
total_metrice['rougeLsum'] /= count
total_metrice
#metrics_func([preds, labels])

756


100%|██████████| 756/756 [27:09<00:00,  2.15s/it]


{'rouge1': 0.44033605066969383,
 'rouge2': 0.22614575892683023,
 'rougeL': 0.3753717622318992,
 'rougeLsum': 0.4061266685861491}

In [1]:
import numpy as np
from transformers import AutoTokenizer
from nltk.tokenize import RegexpTokenizer
import evaluate

mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

rouge_metric = evaluate.load("rouge")


def tokenize_sentence(arg):
  encoded_arg = mt5_tokenizer(arg)
  return mt5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

def eval_func(eval_arg):
    text_preds, text_labels = eval_arg
    text_preds = [(p if p.endswith(("!", "！", "?", "？", "。")) else p + "。") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", "。")) else l + "。") for l in text_labels]
    sent_tokenizer_tw = RegexpTokenizer(u'[^!！?？。]*[!！?？。]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_tw.tokenize(l))) for l in text_labels]
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )



In [3]:
#import eval
import json 
from tqdm import tqdm
topic_labels = []
topic_prebs = []
with open("extract.json", encoding='utf-8') as file:
    datas = list(file)
total_metrice_extract = {
  'rouge1': 0,
  'rouge2': 0,
  'rougeL': 0,
  'rougeLsum': 0
}
for data in tqdm(datas):
    data = json.loads(data)
    topic_labels.append(data["old_title"])
    topic_prebs.append(data["new_title"])
#topic_labels
#topic_prebs
count = 0
for label, prebs in tqdm(zip(topic_labels, topic_prebs)):
    label = [label]
    prebs = [prebs]
    met = eval_func([prebs, label])
    count += 1
    total_metrice_extract['rouge1'] += met['rouge1']
    total_metrice_extract['rouge2'] += met['rouge2']
    total_metrice_extract['rougeL'] += met['rougeL']
    total_metrice_extract['rougeLsum'] += met['rougeLsum']
    #print(score)
total_metrice_extract['rouge1'] /= count
total_metrice_extract['rouge2'] /= count
total_metrice_extract['rougeL'] /= count
total_metrice_extract['rougeLsum'] /= count
total_metrice_extract

100%|██████████| 3777/3777 [00:00<00:00, 236696.89it/s]
302it [00:30,  9.83it/s]


KeyboardInterrupt: 