### 모델 파인튜닝

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
# Mount Google Drive (for Colab users)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, get_scheduler
from datasets import Dataset
import pandas as pd
import torch

# Load the dataset
file_path = "/content/model1.csv"  # 데이터 경로 수정 필요
data = pd.read_csv(file_path)

# Split data into train and validation sequentially
split_ratio = 0.8
train_size = int(len(data) * split_ratio)

train_data = data.iloc[:train_size]
val_data = data.iloc[train_size:]

# Convert to Hugging Face Dataset
def prepare_data(data):
    return Dataset.from_pandas(data[['input', 'output']])

train_dataset = prepare_data(train_data)
val_dataset = prepare_data(val_data)

# Load tokenizer and model
tokenizer = PreTrainedTokenizerFast.from_pretrained("digit82/kobart-summarization")  # gogamza 모델 사용
model = BartForConditionalGeneration.from_pretrained("digit82/kobart-summarization")

# Tokenize the data
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['input'],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        examples['output'],
        max_length=512,
        truncation=True,
        padding="max_length"
    ).input_ids
    model_inputs['labels'] = labels
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./kobart_results",
    evaluation_strategy="steps",
    eval_steps=500,  # 더 자주 평가하여 학습 상태 모니터링
    save_steps=500,  # 모델 저장 주기
    learning_rate=5e-5,  # 조금 더 큰 학습률 사용
    per_device_train_batch_size=16,  # 배치 크기 줄여 메모리 효율 개선
    per_device_eval_batch_size=16,
    num_train_epochs=30,  # 에포크 수를 데이터셋 크기에 맞게 최적화
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./kobart_logs',
    logging_steps=50,
    warmup_steps=100,  # 워밍업 단계에서 학습률 서서히 증가
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,  # 가장 성능 좋은 모델을 학습 후 로드
    report_to="none"
)

# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = (
    len(train_dataset) * training_args.num_train_epochs // training_args.per_device_train_batch_size
)
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)

# Define Early Stopping Callback
from transformers import EarlyStoppingCallback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5)  # Early stopping patience 증가

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping_callback]
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("/content/drive/MyDrive/digit82_batch16_epoch30")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Map:   0%|          | 0/839 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.0322,0.42014
1000,0.0071,0.450762
1500,0.0042,0.464906


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


In [5]:
import re
import json
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

# 모델과 토크나이저 로드
model_path = "/content/drive/MyDrive/digit82_batch16_epoch30"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

def preprocess_and_split(text):
    """
    텍스트 전처리 및 청크 분리 함수
    """
    text = text.replace('*', '').replace('◆', '').replace('◇', '')
    text = re.sub(r'\s+', ' ', text)  # 연속된 공백 제거
    text = text.replace('\n', ' ').replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")
    text = re.sub(r'\([^가-힣]*[\u4E00-\u9FFF\u3040-\u30FF]+[^가-힣]*\)', '', text)  # 괄호 안 한자/일본어 제거
    text = re.sub(r'[\u4E00-\u9FFF\u3040-\u30FF]+', '', text)  # 한자/일본어 제거

    chunks = [text[i:i + 10000] for i in range(0, len(text), 10000)]
    result = []

    for chunk in chunks:
        buffer = ""
        is_open_double_quote = False
        is_open_single_quote = False

        for char in chunk:
            buffer += char

            if char == '"':
                is_open_double_quote = not is_open_double_quote
            elif char == "'":
                is_open_single_quote = not is_open_single_quote

            if len(buffer) >= 250:
                if char in ['.', '!', '?'] and not is_open_double_quote and not is_open_single_quote:
                    result.append(buffer.strip())
                    buffer = ""
                elif char in ['"'] and buffer[-3] in ['"']:
                    result.append(buffer[:-1].strip())
                    buffer = '"'
                elif char in ["'"] and buffer[-3] in ["'"]:
                    result.append(buffer[:-1].strip())
                    buffer = "'"
                elif re.search(r'\.\.\.|!!|\?\?', buffer):
                    match = re.search(r'(\.\.\.|!!|\?\?)', buffer)
                    split_index = match.end()
                    result.append(buffer[:split_index].strip())
                    buffer = buffer[split_index:].strip()
    return result

def parse_output_to_json(generated_output):
    """
    모델 출력을 JSON 형식으로 변환
    """
    parsed_data = {}
    try:
        location_match = re.search(r"\[location\]\s*(.+)", generated_output)
        if location_match:
            parsed_data["location"] = location_match.group(1).strip()

        caption_match = re.search(r"\[caption\]\s*(.+)", generated_output)
        if caption_match:
            parsed_data["caption"] = caption_match.group(1).strip()

        dialogues_match = re.search(r"\[dialogues\]\s*(\[.+)", generated_output, re.DOTALL)
        if dialogues_match:
            dialogues_raw = dialogues_match.group(1).strip()
            dialogues = []
            for speaker, dialogue in re.findall(r"\[speaker\]\s*(.+?)\s*\[dialogue\]\s*(.+?)(?=\[speaker\]|\Z)", dialogues_raw, re.DOTALL):
                dialogue = re.sub(r"\s*\n\s*[\[\]]?", "", dialogue).strip()
                dialogues.append({
                    "speaker": speaker.strip(),
                    "dialogue": dialogue
                })
            parsed_data["dialogues"] = dialogues

    except Exception as e:
        parsed_data["error"] = f"Parsing error: {str(e)}"
        parsed_data["content"] = generated_output

    return parsed_data

def process_file(file_path):
    """
    개별 텍스트 파일을 처리하고 결과를 출력
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        split_text = preprocess_and_split(text)

        for idx, chunk in enumerate(split_text):
            print(f"청크 {idx + 1} 처리 중...")

            # 모델 입력 생성
            inputs = tokenizer(chunk, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

            # 모델 출력 생성
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=512,
                num_beams=4,
                length_penalty=1.0,
                early_stopping=True,
            )

            # 디코딩
            generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # 모델 출력 파싱 및 정제
            parsed_output = parse_output_to_json(generated_output)

            # JSON 형식으로 출력
            print(f"청크 {idx + 1} 결과:")
            print(json.dumps(parsed_output, indent=4, ensure_ascii=False))  # JSON 형식으로 출력

            # 사용자 확인 후 다음 청크 진행
            user_input = input(f"청크 {idx + 1} 처리를 완료하려면 '완료'를 입력하세요: ")
            if user_input.strip().lower() != "완료":
                print("작업을 중단합니다.")
                return

# 사용 예시
file_path = r'/content/미로(애장판) 1 (박수정) (Z-Library).txt'
process_file(file_path)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


청크 1 처리 중...
청크 1 결과:
{
    "location": "\"도시, 클럽 근처\"",
    "caption": "\"윤은 누군가 자신을 따라오고 있다는 것을 느끼며, 그녀가 하룻밤 상대였지만, 그녀가 다음 날부터 여자친구처럼 행동하려는 모습에 당황한다.\"",
    "dialogues": [
        {
            "speaker": "\"윤\"",
            "dialogue": "\"미치겠네.\""
        },
        {
            "speaker": "\"\"",
            "dialogue": "\"\""
        }
    ]
}
청크 1 처리를 완료하려면 '완료'를 입력하세요: 완료
청크 2 처리 중...
청크 2 결과:
{
    "location": "\"도시, 회사 근처\"",
    "caption": "\"윤은 지수의 끊임없는 연락과 협박에 시달리며, 지수의 주장에 헛웃음이 나올 정도로 기가 막혀한다.\"",
    "dialogues": [
        {
            "speaker": "\"\"",
            "dialogue": "\"\""
        },
        {
            "speaker": "\"\"",
            "dialogue": "\"\""
        }
    ]
}
청크 2 처리를 완료하려면 '완료'를 입력하세요: 완료
청크 3 처리 중...
청크 3 결과:
{
    "location": "\"도시, 회사 근처\"",
    "caption": "\"윤은 여대생의 일인시위를 막기 위해 빠르게 상황을 처리하려 한다. 그는 전화를 끊고 나서 다시 연락을 하지 않기로 결심한다. 그러나 상대는 여전히 윤을 뒤쫓고 있다.\"",
    "dialogues": [
        {
            "speaker": "\"\"",
            "dialogue": "\