<a href="https://colab.research.google.com/github/kimdonggyu2008/2024_2_Capstone/blob/main/%EC%9A%94%EC%95%BD%EB%AA%A8%EB%8D%B8%EA%B3%BC_%EB%8D%B0%EC%9D%B4%ED%84%B0%EC%85%8B_%ED%95%99%EC%8A%B5_%EC%BD%94%EB%93%9C(%EA%B9%80%EC%97%B0%ED%9B%88).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Debugging aid: make CUDA kernel launches synchronous so a device-side error
# is reported at the call that triggered it. This is set before any library
# that initializes CUDA is imported.
# NOTE(review): synchronous launches slow GPU work down — presumably this
# should be removed for real training runs; confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [2]:
# Install the Hugging Face `datasets` package into the Colab runtime.
# NOTE(review): the version is not pinned; the recorded output shows 3.0.1 was installed.
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:

In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
from google.colab import drive
import torch
import numpy as np
import pickle
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so the Drive paths used by later cells resolve.
# (`drive` is already imported in the imports cell; this import is a repeat.)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Model setup: load the pretrained Pegasus tokenizer and seq2seq model
# identified by `model_name`. Both objects are module-level globals that the
# preprocessing, training and summarization cells read.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Load the dataset (the CNN/DailyMail dataset from Hugging Face, configuration "3.0.0").
# Later cells read its "train" split and the "article" / "highlights" fields.
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [None]:
# Report the maximum and the 90th-percentile article length in the training split.
# Lengths are measured in characters (len of the raw string), not in tokens.
article_lengths = [len(article) for article in dataset["train"]["article"]]
max_article_length = max(article_lengths)
# The variable name and the printed label both say 90, so compute the 90th
# percentile (the original computed the 95th).
length_90_percentile = int(np.percentile(article_lengths, 90))
print(f"Maximum article length in dataset: {max_article_length} characters")
print(f"90% percentile article length in dataset: {length_90_percentile} characters")

In [None]:
# NOTE: disabled earlier draft of the preprocessing setup, kept for reference only.
# A live `preprocess_function` is defined in a later cell.
# processed_dataset_path = "/content/drive/MyDrive/Colab Notebooks/Pegasus_요약모델/tokenized_datasets.pkl"

# # Preprocessing function: tokenization and padding
# def preprocess_function(examples):
#     inputs = examples["article"]
#     model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

#     # Set up the labels
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [None]:
# Google Drive locations used by the preprocessing cells.
# Pickle file that holds the list of tokenized examples (loaded to resume, rewritten on save).
processed_dataset_path = "/content/drive/MyDrive/summarizer/preprcessed/preprocessed_dataset.pkl"
# Folder that is scanned for additional *.csv summarization datasets.
data_folder_path = "/content/drive/MyDrive/summarizer/data"

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Token budgets used when tokenizing.
# google/pegasus-xsum has 512 position embeddings, so longer (or longer-padded)
# inputs index past the embedding table and fail inside the model.
MAX_SOURCE_TOKENS = 512
MAX_TARGET_TOKENS = 128
# Label value that the loss computation ignores; used to mask padding positions.
LABEL_IGNORE_INDEX = -100


def preprocess_function(examples):
    """Tokenize articles and their reference summaries for seq2seq training.

    Works for a single example (string fields) and for a batch (list-of-string
    fields, as passed by ``dataset.map(..., batched=True)``).

    Args:
        examples: Mapping with an "article" field (model input) and a
            "highlights" field (target summary).

    Returns:
        The tokenizer output for the articles, padded/truncated to
        ``MAX_SOURCE_TOKENS``, with an added "labels" entry holding the target
        token ids padded/truncated to ``MAX_TARGET_TOKENS``. Padding positions
        in the labels are replaced by ``LABEL_IGNORE_INDEX``. All values are
        plain Python lists so the result can be stored by ``datasets`` or
        pickled without holding GPU memory; moving batches to the device is
        left to the training loop.
    """
    inputs = examples["article"]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_TOKENS,
        truncation=True,
        padding="max_length",
    )

    # Tokenize the targets; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    targets = examples["highlights"]
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_TOKENS,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replace pad tokens so they do not contribute to the loss.
        return [LABEL_IGNORE_INDEX if tok == pad_id else tok for tok in token_ids]

    label_ids = labels["input_ids"]
    if isinstance(targets, str):
        # Single example: `input_ids` is one flat list of ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    else:
        # Batch: `input_ids` is one list of ids per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    return model_inputs

In [None]:
# Load every CSV in `data_folder_path` and rename its columns to the
# "article" / "highlights" names that `preprocess_function` reads.

# The files are read AND written with the same encoding. Reading as latin1 but
# writing with the default UTF-8 (as before) re-encodes non-ASCII characters on
# every run, so re-running the cell progressively corrupts the text.
CSV_ENCODING = "latin1"

# Per-file mapping from the source column names to the shared names.
# Files not listed here are loaded without renaming.
COLUMN_RENAMES = {
    "news_summary.csv": {"ctext": "article", "text": "highlights"},
    "news_summary_more.csv": {"headlines": "highlights", "text": "article"},
}

dataframes = []
# sorted() makes the order of `dataframes` independent of the filesystem listing order.
for filename in sorted(os.listdir(data_folder_path)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(data_folder_path, filename)
    df = pd.read_csv(file_path, encoding=CSV_ENCODING)

    # Rename only the columns configured for this file; once the file has been
    # rewritten with the new names this is a no-op, so the cell can be re-run.
    df = df.rename(columns=COLUMN_RENAMES.get(filename, {}))

    # Save the renamed frame back to the same file (same encoding as the read).
    df.to_csv(file_path, index=False, encoding=CSV_ENCODING)

    # Collect the loaded frame.
    dataframes.append(df)

In [None]:
# Tokenize the training split example by example, resuming from a pickle
# checkpoint on Drive when one exists.

# First dataset["train"] index handled by this cell.
# NOTE(review): 110000 comes from the original code; presumably the earlier
# examples are handled elsewhere — confirm.
FIRST_EXAMPLE_INDEX = 110000
# The accumulated list is written to disk every this many examples.
CHECKPOINT_EVERY = 10000


def _save_checkpoint(path, examples):
    """Pickle `examples` to `path` via a temporary file and a rename."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(examples, f)
    # Swap the finished file into place so an interrupted write does not
    # truncate the previous checkpoint.
    os.replace(tmp_path, path)


tokenized_datasets = []
if os.path.exists(processed_dataset_path):
    try:
        print("Loading preprocessed dataset from file...")
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this notebook itself wrote.
        with open(processed_dataset_path, "rb") as f:
            tokenized_datasets = pickle.load(f)
        print("Preprocessed dataset loaded successfully.")
    except (pickle.UnpicklingError, EOFError):
        print("Error loading preprocessed dataset. Starting from scratch...")
        tokenized_datasets = []
else:
    print("No preprocessed dataset found. Starting from scratch...")

# The saved list starts at FIRST_EXAMPLE_INDEX, so the next example to process
# is offset by that base. Using len() alone (as before) restarted at the wrong
# dataset index after a resume.
start_index = FIRST_EXAMPLE_INDEX + len(tokenized_datasets)

print("Preprocessing dataset from index {}...".format(start_index))
train_split = dataset["train"]
for index in range(start_index, len(train_split)):
    tokenized_data = preprocess_function(train_split[index])
    tokenized_datasets.append(tokenized_data)
    # Periodic checkpoint so progress is kept if the run is interrupted
    # (the full list still stays in RAM).
    if (index + 1) % CHECKPOINT_EVERY == 0:
        _save_checkpoint(processed_dataset_path, tokenized_datasets)
        print(f"Saved {index + 1} tokenized examples so far...")

# Final save
_save_checkpoint(processed_dataset_path, tokenized_datasets)
print("Tokenized dataset saved successfully.")

In [12]:
# NOTE: disabled earlier version of the load-or-preprocess cell, kept for reference only.
# The recorded output of this cell is an AttributeError: `dataset` is a DatasetDict,
# which has no `iterrows`.
# if os.path.exists(processed_dataset_path):
#     print("Loading preprocessed dataset from file...")
#     with open(processed_dataset_path, "rb") as f:
#         tokenized_datasets = pickle.load(f)
#     print("Preprocessed dataset loaded successfully.")
# else:
#     print("Preprocessing dataset...")
#     tokenized_datasets = []
#     for index, row in dataset.iterrows():
#         tokenized_data = preprocess_function(row)
#         tokenized_datasets.append(tokenized_data)
#         # Save periodically to conserve RAM
#         if (index + 1) % 1000 == 0:  # save every 1000 examples
#             with open(processed_dataset_path, "wb") as f:
#                 pickle.dump(tokenized_datasets, f)
#             print(f"Saved {index + 1} tokenized examples so far...")

#     # Final save
#     with open(processed_dataset_path, "wb") as f:
#         pickle.dump(tokenized_datasets, f)
#     print("Tokenized dataset saved successfully.")

Preprocessing dataset...


AttributeError: 'DatasetDict' object has no attribute 'iterrows'

In [None]:
# Dataset preprocessing: apply preprocess_function to `dataset` in batches.
# This rebinds `tokenized_datasets`, replacing the list built by the
# pickle-based preprocessing cell.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 학습 및 저장


In [None]:
# Drive folder intended for training checkpoints.
# NOTE(review): no visible cell reads this variable; TrainingArguments uses a
# different output_dir — confirm which path is intended.
model_checkpoint_path = "/content/drive/MyDrive/summarizer/checkpoints"

In [None]:
# # Training configuration (disabled earlier copy; identical to the active call below)
# training_args = TrainingArguments(
#     output_dir="/content/drive/MyDrive/Colab Notebooks/Pegasus_요약모델/results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,  # reduced batch size
#     per_device_eval_batch_size=2,  # reduced batch size
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="/content/drive/MyDrive/Colab Notebooks/Pegasus_요약모델/logs",
#     logging_steps=10,
#     save_total_limit=2,
# )

# Training configuration passed to the Trainer.
# NOTE(review): `model_checkpoint_path` is defined in this notebook but not used
# here; output_dir points at a different Drive folder — confirm which is intended.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/Pegasus_요약모델/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # reduced batch size
    per_device_eval_batch_size=2,  # reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/Pegasus_요약모델/logs",
    logging_steps=10,
    save_total_limit=2,
)

In [None]:
# Trainer setup (the commented block is a disabled earlier copy, identical to the active call below)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"] if "validation" in tokenized_datasets else None,
#     tokenizer=tokenizer,
# )

# Build the Trainer from the global model, training arguments and tokenized splits.
# The "validation" split is used for evaluation when present; otherwise no
# evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"] if "validation" in tokenized_datasets else None,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trained model to Drive; `final_model_path` is read again by the
# reload cell that follows.
final_model_path = "/content/drive/MyDrive/summarizer/final_model"
trainer.save_model(final_model_path)
print("Final model saved successfully.")

In [None]:
# Reload the fine-tuned model from Drive for inference.
# The saved checkpoint comes from the PegasusForConditionalGeneration trained in
# this notebook, so it has to be loaded with that same class (already imported in
# the imports cell). Loading it through T5ForConditionalGeneration, as before,
# builds a different architecture whose parameters do not match the checkpoint.
model = PegasusForConditionalGeneration.from_pretrained(final_model_path)
print("Final model loaded successfully.")

# 테스트 요약

In [None]:
# Summarization helper used to check the model's output on arbitrary text.
def summarize_article(article_text, max_length=60, num_beams=5):
    """Generate a summary of `article_text` with the global `model` and `tokenizer`.

    Args:
        article_text: Text to summarize.
        max_length: Maximum length passed to ``model.generate`` (default 60,
            the value previously hard-coded).
        num_beams: Beam-search width passed to ``model.generate`` (default 5,
            the value previously hard-coded).

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    inputs = tokenizer(article_text, truncation=True, padding="longest", return_tensors="pt")
    # Put the inputs on the same device as the model so generation also works
    # when the model lives on the GPU (inputs and model must share a device).
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [None]:
# Summarize a user-supplied text and print the result.
# Replace the placeholder string below with the text to summarize.
article_text = "여기다가 입력"
summary = summarize_article(article_text)
# One print call: blank line, the "Summary:" header, then the summary on its own line.
print("\nSummary:", summary, sep="\n")