### 모델 파인튜닝

In [None]:
# Install Hugging Face `datasets` into the running kernel's environment
# (`%pip` targets the kernel itself, unlike a `!pip` shell call).
# Pinned to the release this notebook was last executed with, so a fresh
# runtime resolves the same version.
%pip install -q datasets==3.2.0

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Mount Google Drive (for Colab users)
# NOTE: `google.colab` is expected to be importable only inside a Colab runtime.
from google.colab import drive
# The dataset and model paths used later in this notebook live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import math

import pandas as pd
import torch
from datasets import Dataset
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, get_scheduler

# Load the dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/시나리오 변환 NLP/action.csv"  # adjust to your own Drive layout
try:
    # This notebook has always read the path as an Excel workbook, so that
    # stays the first choice (covers a workbook saved under a .csv name).
    data = pd.read_excel(file_path)
except ValueError:
    # pandas raises ValueError when the file is not a recognisable Excel
    # workbook; the path ends in ".csv", so parse it as plain CSV instead.
    data = pd.read_csv(file_path)

# The whole table (100%) is used for training; no validation split is held out.
train_data = data

# Build a Hugging Face Dataset that keeps only the 'caption' and 'act' columns.
def prepare_data(data):
    """Convert a pandas DataFrame into a Dataset restricted to 'caption' and 'act'.

    Args:
        data: DataFrame that contains at least the 'caption' and 'act' columns.

    Returns:
        The result of ``Dataset.from_pandas`` applied to those two columns.
    """
    wanted_columns = ['caption', 'act']
    subset = data[wanted_columns]
    return Dataset.from_pandas(subset)

# Turn the full training table into a Dataset.
train_dataset = prepare_data(train_data)

# Tokenizer and seq2seq model both start from the same pretrained checkpoint.
pretrained_checkpoint = "digit82/kobart-summarization"
tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_checkpoint)
model = BartForConditionalGeneration.from_pretrained(pretrained_checkpoint)

# Tokenize the data
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of caption/act pairs for seq2seq training.

    Args:
        examples: Batch mapping with a 'caption' entry (source texts) and an
            'act' entry (target texts), as passed by ``Dataset.map`` when
            ``batched=True``.
        max_length: Length every source and target sequence is truncated and
            padded to. Defaults to 512.

    Returns:
        The encoded captions with an added 'labels' entry: the encoded targets
        in which every padding position is replaced by -100.
    """
    model_inputs = tokenizer(
        examples['caption'],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        examples['act'],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    # Targets are padded to `max_length`, so most label positions are padding.
    # -100 is the index the cross-entropy loss ignores; without this masking
    # the model is mostly trained to predict pad tokens. The attention mask
    # (0 on padding) is used instead of comparing against the pad id so that
    # real tokens are never masked by accident.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]
    return model_inputs

# Run the tokenizer over the whole dataset, one batch of rows at a time.
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    # --- output locations and logging ---
    output_dir="./kobart_results",
    logging_dir='./kobart_logs',
    logging_steps=50,
    save_total_limit=2,
    report_to="none",
    # --- optimisation ---
    num_train_epochs=50,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # --- precision: fp16 only when a CUDA device is available ---
    fp16=torch.cuda.is_available(),
)

# Define optimizer and scheduler
# This optimizer is handed to the Trainer ready-made, so the weight decay
# configured in `training_args` is forwarded explicitly to keep the two in sync.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# The scheduler must span every optimizer update of the run. The last, partial
# batch of each epoch still produces a step, so the per-epoch count is rounded
# up; flooring the grand total (as before) makes the linear schedule hit a
# learning rate of 0 before training has finished.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
# With the default gradient_accumulation_steps=1 this equals batches_per_epoch.
updates_per_epoch = max(
    math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps), 1
)
num_training_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)



# Wire the model, data, tokenizer and the hand-built optimizer/scheduler pair
# into a Trainer.
optimizer_and_scheduler = (optimizer, scheduler)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    optimizers=optimizer_and_scheduler,
)

# Run fine-tuning.
trainer.train()

# Save the fine-tuned model to Google Drive; the inference cell loads the
# model and tokenizer back from this directory.
output_model_dir = "/content/drive/MyDrive/Colab Notebooks/시나리오 변환 NLP/prompt_model"
trainer.save_model(output_model_dir)


In [None]:
import json
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

# Load the fine-tuned model and its tokenizer from the directory on Google Drive
# (the same path the training cell passes to `trainer.save_model`).
model_path = "/content/drive/MyDrive/Colab Notebooks/시나리오 변환 NLP/prompt_model"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

def process_input(text=None):
    """Generate the model output for one text and print it as JSON.

    Args:
        text: Text to convert. When omitted (the default) the text is read
            interactively with ``input()``, as before.

    Returns:
        None. The decoded generation is printed as ``{"output": ...}``.
        Blank input is reported and skipped instead of being sent to the model.
    """
    if text is None:
        text = input("텍스트를 입력하세요: ")

    # Nothing to convert: skip generation rather than decode an empty prompt.
    if not text.strip():
        print("입력된 텍스트가 없습니다.")
        return

    # Build the model input. A single sequence needs no padding, so it is only
    # truncated to the 512-token limit instead of being padded up to it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inputs must sit on the same device as the model weights.
    device = model.device

    # Beam-search generation; only input_ids and attention_mask are forwarded.
    output_ids = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_length=512,
        num_beams=4,
        length_penalty=1.0,
        early_stopping=True,
    )

    # Decode the first returned sequence, dropping special tokens.
    generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Print as JSON; ensure_ascii=False keeps Korean text readable.
    print("결과:")
    print(json.dumps({"output": generated_output}, indent=4, ensure_ascii=False))

# Run: prompts for a text, then prints the generated result as JSON.
process_input()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


텍스트를 입력하세요: 윤은 누군가 자신을 따라오고 있다는 것을 느끼며, 그 상대가 얼마 전 만났던 여대생임을 깨닫는다. 그는 그녀가 하룻밤 상대였지만, 그녀가 다음 날부터 여자친구처럼 행동하려는 모습에 당황한다.
결과:
{
    "output": "윤은 누군가 자신을 따라오고 있다는 것을 느끼며, 그 상대가 얼마 전 만났던 여대생임을 깨닫는다."
}
