### 모델 파인튜닝

In [24]:
!pip install datasets



In [46]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# # Mount Google Drive (for Colab users)
# from google.colab import drive
# drive.mount('/content/drive')

# Load the dataset
file_path = "/content/최종_df.csv"
data = pd.read_csv(file_path)

# Split data into train and validation
train_data = data.sample(frac=0.8, random_state=42)
val_data = data.drop(train_data.index)

# Convert to Hugging Face Dataset
def prepare_data(data):
    return Dataset.from_pandas(data[['input', 'output']])

train_dataset = prepare_data(train_data)
val_dataset = prepare_data(val_data)

# Load tokenizer and model
tokenizer = PreTrainedTokenizerFast.from_pretrained("digit82/kobart-summarization")
model = BartForConditionalGeneration.from_pretrained("digit82/kobart-summarization")

# Tokenize the data
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input'], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples['output'], max_length=512, truncation=True, padding="max_length").input_ids
    model_inputs['labels'] = labels
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./kobart_results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./kobart_logs',
    logging_steps=10,
    save_steps=500,
    warmup_steps=500,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./kobart-summarization-finetuned")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.5828,3.21744
2,0.5406,0.465294
3,0.31,0.296479


### 출력 테스트

In [47]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import re
import json

# 모델과 토크나이저 로드
model_path = "/content/kobart-summarization-finetuned"
tokenizer = PreTrainedTokenizerFast.from_pretrained("digit82/kobart-summarization")
model = BartForConditionalGeneration.from_pretrained(model_path)

def parse_output_to_json(generated_output):
    """
    모델 출력을 JSON 형식으로 변환하고 불필요한 문자를 정제합니다.
    """
    parsed_data = {}
    try:
        # location 추출
        location_match = re.search(r"\[location\]\s*(.+)", generated_output)
        if location_match:
            parsed_data["location"] = location_match.group(1).strip()

        # caption 추출
        caption_match = re.search(r"\[caption\]\s*(.+)", generated_output)
        if caption_match:
            parsed_data["caption"] = caption_match.group(1).strip()

        # dialogues 추출
        dialogues_match = re.search(r"\[dialogues\]\s*(\[.+)", generated_output, re.DOTALL)
        if dialogues_match:
            dialogues_raw = dialogues_match.group(1).strip()
            dialogues = []
            # 대화 목록 추출
            for speaker, dialogue in re.findall(r"\[speaker\]\s*(.+?)\s*\[dialogue\]\s*(.+?)(?=\[speaker\]|\Z)", dialogues_raw, re.DOTALL):
                # 정제 단계: dialogue 값에서 불필요한 패턴 제거
                dialogue = re.sub(r"\s*\n\s*[\[\]]?", "", dialogue).strip()
                dialogues.append({
                    "speaker": speaker.strip(),
                    "dialogue": dialogue
                })
            parsed_data["dialogues"] = dialogues

    except Exception as e:
        parsed_data["error"] = f"Parsing error: {str(e)}"
        parsed_data["content"] = generated_output

    return parsed_data

def slice_and_process_text(input_file_path, slice_num=20000, chunk_size=500):
    """
    텍스트를 지정된 길이로 자르고, JSON으로 변환 후 출력합니다.
    """
    # 텍스트 파일 읽기
    with open(input_file_path, 'r', encoding='utf-8') as f:
        input_txt = f.read()

    # 줄바꿈 제거 및 텍스트 자르기
    input_txt = input_txt.replace('\n', ' ')
    input_txt = input_txt[:slice_num]  # 지정된 크기만큼 자르기

    # 문장 끝 패턴 정의 및 분리
    sentence_end_pattern = re.compile(r'([.!?]["”]?\s)')  # 문장 끝 패턴
    sentences = sentence_end_pattern.split(input_txt)

    # 문장을 청크로 나누기
    chunks = []
    current_chunk = ""

    for i in range(0, len(sentences) - 1, 2):
        sentence = sentences[i] + sentences[i + 1]
        if len(current_chunk) + len(sentence) > chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    # 청크별로 모델을 통해 처리
    for idx, chunk in enumerate(chunks):
        # 모델 입력 생성
        inputs = tokenizer(chunk, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

        # 모델 출력 생성
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=512,
            num_beams=4,
            length_penalty=1.0,
            early_stopping=True,
        )

        # 디코딩
        generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # 모델 출력 파싱 및 정제
        parsed_output = parse_output_to_json(generated_output)

        # JSON 출력
        print(f"청크 {idx + 1} 결과:")
        print(json.dumps(parsed_output, indent=4, ensure_ascii=False))

        # 사용자 확인 후 다음 청크 진행
        user_input = input(f"청크 {idx + 1} 처리를 완료하려면 '완료'를 입력하세요: ")
        if user_input.strip().lower() != "완료":
            print("작업을 중단합니다.")
            break

# 사용 예시
input_path = f"/content/화산귀환 (비가) (Z-Library)_원문.txt"
slice_and_process_text(input_path)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


청크 1 결과:
{
    "location": "산봉우리",
    "caption": "청명은 산봉우리에서 검을 휘두르며 분노를 터뜨린다.",
    "dialogues": [
        {
            "speaker": "청명",
            "dialogue": "이제 이곳에 남은 것은 죽음뿐이다."
        }
    ]
}
청크 1 처리를 완료하려면 '완료'를 입력하세요: 완료
청크 2 결과:
{
    "location": "산봉우리",
    "caption": "청명은 산봉우리에서 죽음을 맞이한다.",
    "dialogues": [
        {
            "speaker": "청명",
            "dialogue": "\"......장문사형\""
        },
        {
            "speaker": "청명",
            "dialogue": "\"......장문사형\""
        }
    ]
}
청크 2 처리를 완료하려면 '완료'를 입력하세요: 끝
작업을 중단합니다.
