In [13]:
import os
from fsspec.implementations.local import LocalFileSystem
print(LocalFileSystem().glob(os.path.join("./folder", "**train*")))

[]


In [16]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")
print(f"특성: {dataset['train'].column_names}")

ValueError: At least one data file must be specified, but got data_files=None

In [15]:
sample = dataset["train"][1]

print(f"""기사 (500개 문자 발췌, 총 길이: {len(sample["article"])}):""")
print(sample["article"][:500])
print(f'\n요약 (길이: {len(sample["highlights"])}):')
print(sample["highlights"])

NameError: name 'dataset' is not defined

In [None]:
sample_text = dataset["train"][1]["article"][:2000]
summaries = {}

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

In [None]:
nltk.download("punkt")

string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/parkhyerin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['The U.S. are a country.', 'The U.N. is an organization.']

In [None]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

summaries["baseline"] = three_sentence_summary(sample_text)

summaries["baseline"]

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'

In [None]:
from transformers import pipeline, set_seed

# GPT-2

set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl")
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :])
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
summaries["gpt2"]

'1.\nIn Miami, where about one-third of all inmates are mentally ill, many prisoners on the "forgotten floor" are mentally ill and suffering from mental illness.\nI was in Miami for a trial.\nIt was a high-profile case about murder -- it would be the most high-profile murder case in Miami-Dade history.\nI\'ve been covering a lot'

In [None]:
# T5

pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 1.44MB/s]
model.safetensors: 100%|██████████| 2.95G/2.95G [00:28<00:00, 105MB/s] 
generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 60.5kB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.00MB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 1.74MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
summaries["t5"]

'mentally ill inmates are housed on the ninth floor of a florida jail .\nmost face drug charges or charges of assaulting an officer .\njudge says arrests often result from confrontations with police .\none-third of all people in Miami-dade county jails are mental ill .'

In [None]:
# BART
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

config.json: 100%|██████████| 1.58k/1.58k [00:00<00:00, 197kB/s]
model.safetensors: 100%|██████████| 1.63G/1.63G [00:19<00:00, 82.7MB/s]
generation_config.json: 100%|██████████| 363/363 [00:00<00:00, 148kB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 4.33MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 763kB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 6.79MB/s]


In [None]:
summaries["bart"]

'Mentally ill inmates are housed on the "forgotten floor" of Miami-Dade jail.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.\nHe says about one-third of all people in the county jails are mentally ill.'

In [None]:
# Pegasus
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
summaries["pegasus"]

'Mentally ill inmates in Miami are housed on the "forgotten floor"<n>The ninth floor is where they\'re held until they\'re ready to appear in court.\nMost often, they face drug charges or charges of assaulting an officer.\nThey end up on the ninth floor severely mentally disturbed .'

## 6.5 CNN/DailyMail 데이터셋에서 PEGASUS 평가하기

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=1b0d133bf95c17d061042fc34218409cf59cc492180b71990cb73a4f992c6935
  Stored in directory: /Users/parkhyerin/Library/Caches/pip/wheels/9b/3d/39/09558097d3119ca0a4d462df68f22c6f3c1b345ac63a09b86e
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.1.0 rouge_score-0.1.2


In [None]:
import pandas as pd
from datasets import load_metric

rouge_metric = load_metric("rouge")

reference = dataset["train"][1]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index=summaries.keys())

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


NameError: name 'dataset' is not defined

In [None]:
def evaluate_summaries_baseline(dataset,
                                metric,
                                column_text="article",
                                column_summary="highlights"):
    summaries = [three_sentence_summary(text) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries,
                     references=dataset[column_summary])
    score = metric.compute()
    return score

In [None]:
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))

score = evaluate_summaries_baseline(test_sampled, rouge_metric)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T