In [None]:
# 필요한 패키지 설치
!pip install datasets transformers rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=b7933b1dfdddf960a60062f7a6054470fd7f3d12342b1fa4a7bdfd3013006f7d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# Transformers에서 필요한 라이브러리 import
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 데이터셋 로드를 위한 datasets 라이브러리 import
from datasets import load_dataset

# ROUGE 점수 계산을 위한 rouge_score 라이브러리 import
from rouge_score import rouge_scorer

In [None]:
# ROUGE 점수를 계산하는 함수 정의
def calculate_rouge(reference_summary, generated_summary):
    """Score a generated summary against its reference with ROUGE.

    Args:
        reference_summary: Reference text; handed to the scorer first.
        generated_summary: Candidate text; handed to the scorer second.

    Returns:
        A tuple ``(rouge1, rouge2, rougeL)`` holding the ``fmeasure`` value
        of each metric.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    # A scorer (with stemming enabled) is built for the call and used once.
    results = rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_summary, generated_summary
    )

    # Keep only the F-measure of each metric, in the order listed above.
    return tuple(results[name].fmeasure for name in metric_names)

In [None]:
# Load the `ca_test` split of the BillSum dataset.
billsum_ca_test = load_dataset("billsum", split="ca_test")

# Split it into train/test portions (20% test). The seed is fixed so that the
# same samples land in the test portion on every run; without it the evaluated
# subset, and therefore the reported ROUGE averages, change between runs.
billsum = billsum_ca_test.train_test_split(test_size=0.2, seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [None]:
def _load_summarizer(checkpoint):
    """Return the ``(tokenizer, model)`` pair for a seq2seq checkpoint name."""
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


# BART
tokenizer_bart, model_bart = _load_summarizer("facebook/bart-large-cnn")

# T5
tokenizer_t5, model_t5 = _load_summarizer("Falconsai/text_summarization")

# Pegasus
tokenizer_pegasus, model_pegasus = _load_summarizer("google/pegasus-large")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [None]:
# 요약을 생성하는 함수 정의
def summarize_text(text, model, tokenizer, max_length=128, *, prefix="summarize: ",
                   summary_max_length=50, summary_min_length=10,
                   num_beams=4, length_penalty=2.0):
    """Generate a summary of ``text`` with a seq2seq model.

    Args:
        text: Document to summarize.
        model: Object exposing ``generate``; if it has a ``device`` attribute
            the encoded inputs are moved to that device first.
        tokenizer: Callable that encodes text and exposes ``decode``.
        max_length: Maximum number of *input* tokens; longer inputs are
            truncated.
        prefix: String prepended to ``text`` before tokenization.
            NOTE(review): "summarize: " looks like a T5-style task prefix and
            is applied to every model by default to preserve the existing
            behaviour; pass ``prefix=""`` for checkpoints not trained with it.
        summary_max_length: ``max_length`` forwarded to ``generate``.
        summary_min_length: ``min_length`` forwarded to ``generate``.
        num_beams: Beam count forwarded to ``generate``.
        length_penalty: Length penalty forwarded to ``generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer(prefix + text, return_tensors="pt", max_length=max_length, truncation=True)

    # Keep the inputs on the same device as the model; otherwise generation
    # fails as soon as the model has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=summary_max_length,
        min_length=summary_min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Summarize every test sample with each model and report the mean ROUGE F1 scores.
summarizers = [
    ("BART", model_bart, tokenizer_bart),
    ("T5 Fine-Tuned", model_t5, tokenizer_t5),
    ("Pegasus", model_pegasus, tokenizer_pegasus),
]

for model_name, model, tokenizer in summarizers:
    print(f"Testing {model_name}...")

    # One list per metric, filled sample by sample.
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for sample in billsum['test']:
        # Source document and its reference summary.
        input_text, reference_summary = sample['text'], sample['summary']

        generated_summary = summarize_text(input_text, model, tokenizer)

        rouge1, rouge2, rougeL = calculate_rouge(reference_summary, generated_summary)
        for bucket, value in (
            (rouge1_scores, rouge1),
            (rouge2_scores, rouge2),
            (rougeL_scores, rougeL),
        ):
            bucket.append(value)

    # Arithmetic mean of each metric over the evaluated samples.
    avg_rouge1, avg_rouge2, avg_rougeL = (
        sum(rouge1_scores) / len(rouge1_scores),
        sum(rouge2_scores) / len(rouge2_scores),
        sum(rougeL_scores) / len(rougeL_scores),
    )

    # Report the averages for this model.
    print(f"Average ROUGE-1 F1 Score: {avg_rouge1}")
    print(f"Average ROUGE-2 F1 Score: {avg_rouge2}")
    print(f"Average ROUGE-L F1 Score: {avg_rougeL}")
    print("\n")

Testing BART...
Average ROUGE-1 F1 Score: 0.11750928349368582
Average ROUGE-2 F1 Score: 0.04913005085119549
Average ROUGE-L F1 Score: 0.09122514504099893


Testing T5 Fine-Tuned...
Average ROUGE-1 F1 Score: 0.139503780793534
Average ROUGE-2 F1 Score: 0.0589631406649311
Average ROUGE-L F1 Score: 0.10417008420576351


Testing Pegasus...
Average ROUGE-1 F1 Score: 0.1325472312104066
Average ROUGE-2 F1 Score: 0.058030374993381935
Average ROUGE-L F1 Score: 0.09940159045033917


