In [1]:
import json
import pandas as pd
from datasets import Dataset

def load_data(file_path):
    """
    JSON 데이터를 불러와 report와 summary만 Pandas DataFrame으로 변환
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    
    # 'report'와 'summary'만 추출
    data = []
    for item in json_data:
        if 'article' in item and 'summary' in item:
            data.append({
                'article': item['article'],
                'summary': item['summary']
            })
    
    # DataFrame 생성
    df = pd.DataFrame(data)
    print(df.info())
    # None 값을 빈 문자열로 대체
    df = df.fillna('')
    return df

# JSON 파일 경로
file_path = "/home/yjtech2/Desktop/yurim/LLM/Data/summary/train/odor_complaints_dataset.json"

# 데이터 불러오기
df = load_data(file_path)

# 데이터셋 변환 및 분할
data_dict = df.to_dict(orient='list')
train_dataset = Dataset.from_dict(data_dict)
train_dataset = train_dataset.select(range(len(train_dataset) - 100000, len(train_dataset)))
print(len(train_dataset))
valid_dataset = Dataset.from_dict({"err_sentence": [], "cor_sentence": []})

  from .autonotebook import tqdm as notebook_tqdm


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840000 entries, 0 to 839999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   article  840000 non-null  object
 1   summary  840000 non-null  object
dtypes: object(2)
memory usage: 12.8+ MB
None
100000


In [2]:
import os
import torch
from tqdm import tqdm
from typing import Dict
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    AdamW,
    DataCollatorForSeq2Seq,
)

2024-11-22 13:33:53.358088: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-22 13:33:53.383315: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AdamW, DataCollatorForSeq2Seq
import os
import torch
from tqdm import tqdm

class KoBARTSummarizationTrainer:
    def __init__(self, model_name: str, **kwargs):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)  # force_download 제거
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=kwargs.get("learning_rate", 2e-5))
        self.max_length = kwargs.get("max_length", 512)
        self.training_args = kwargs
        self.save_dir = kwargs.get("save_dir", "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/summary/kobart/kobart_model")

        self.best_model_path = os.path.join(self.save_dir, "pytorch_model.bin")
        self.tokenizer_path = self.save_dir

        os.makedirs(self.save_dir, exist_ok=True)

    def save_model_and_tokenizer(self):
        """모델과 토크나이저 저장"""
        self.model.save_pretrained(self.save_dir)
        self.tokenizer.save_pretrained(self.save_dir)
        print(f"Model and tokenizer saved at {self.save_dir}")

    def preprocess_data(self, examples: Dict) -> Dict:
        """데이터 전처리 함수"""
        inputs = [f"요약: {self._normalize_text(text)}" for text in examples["article"]]
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.max_length,
            padding="longest",
            truncation=True,
            return_tensors="pt",
        )

        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                examples["summary"],
                max_length=self.max_length,
                padding="longest",
                truncation=True,
                return_tensors="pt",
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def train(self, train_dataset, valid_dataset=None):
        train_dataset = train_dataset.map(
            self.preprocess_data,
            batched=True,
            remove_columns=train_dataset.column_names,
        )

        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding="longest",
            return_tensors="pt"
        )

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.training_args["batch_size"],
            shuffle=True,
            collate_fn=data_collator
        )

        self.model.train()
        for epoch in range(self.training_args["num_epochs"]):
            epoch_loss = 0
            self.model.zero_grad()

            for step, batch in enumerate(train_dataloader):
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss
                loss = loss / self.training_args.get("gradient_accumulation_steps", 1)
                loss.backward()

                if (step + 1) % self.training_args.get("gradient_accumulation_steps", 1) == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                epoch_loss += loss.item()

            print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss:.4f}")

        self.save_model_and_tokenizer()

    def predict(self, text: str) -> str:
        inputs = self.tokenizer(
            f"요약: {self._normalize_text(text)}",
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
        ).to(self.device)

        outputs = self.model.generate(
            inputs["input_ids"], max_length=self.max_length, num_beams=5
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _normalize_text(self, text: str) -> str:
        return text.strip()

In [4]:
trainer = KoBARTSummarizationTrainer(
    model_name="gogamza/kobart-summarization",
    max_length=512,
    learning_rate=2e-5,
    batch_size=8,
    num_epochs=5,
    gradient_accumulation_steps=8
)

if __name__ == "__main__":
    torch.cuda.empty_cache()  # GPU 메모리 초기화
    trainer.train(train_dataset, valid_dataset)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Map: 100%|██████████| 100000/100000 [00:04<00:00, 23497.90 examples/s]


Epoch 1 completed. Loss: 100.3176
Epoch 2 completed. Loss: 0.5048
Epoch 3 completed. Loss: 0.6213
Epoch 4 completed. Loss: 0.0541
Epoch 5 completed. Loss: 0.0869
Model and tokenizer saved at /home/yjtech2/Desktop/yurim/LLM/Pre_processing/summary/kobart/kobart_model


In [None]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch
import sys

# 저장된 모델 경로
saved_model_path = "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/summary/kobart/kobart_model"

# 모델과 토크나이저 불러오기
tokenizer = PreTrainedTokenizerFast.from_pretrained(saved_model_path)
model = BartForConditionalGeneration.from_pretrained(saved_model_path).to('cpu')

# 테스트용 텍스트
text = "음식물 쓰레기 냄새가 너무 심하게 나고 있습니다"
inputs = tokenizer(
    f"요약: {text}",
    return_tensors="pt",
    max_length=512,
    truncation=True
).to("cpu")

# 요약 생성
outputs = model.generate(
    inputs["input_ids"],
    max_length=128,  # 요약문 길이 설정
    num_beams=5,
    length_penalty=0.6,
    early_stopping=True
)

# 결과 디코딩 및 출력
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


음식물 쓰레기 냄새 문제


: 