In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
import pandas as pd
from datasets import Dataset

# 데이터 불러오기
url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
data = pd.read_csv(url)

# Q와 A를 합쳐서 하나의 텍스트로 만듦
data['text'] = data['Q'] + " " + data['A']

# 데이터셋을 Dataset 객체로 변환
dataset = Dataset.from_pandas(data[['text']])

In [3]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

model_name = 'microsoft/DialoGPT-small'
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

# 패딩 토큰 설정
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# 데이터셋을 토크나이즈
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# PyTorch 형식으로 변환
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])


Map:   0%|          | 0/11823 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("input_ids")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()


Step,Training Loss
500,1.1416
1000,0.8274
1500,0.7559
2000,0.7178
2500,0.6959
3000,0.675


In [None]:
# 모델 저장
model.save_pretrained('./fine_tuned_dialoGPT')
tokenizer.save_pretrained('./fine_tuned_dialoGPT')

# 텍스트 생성
from transformers import pipeline

generator = pipeline('text-generation', model='./fine_tuned_dialoGPT', tokenizer=tokenizer)
output = generator("안녕하세요, 오늘 기분은 어때요?", max_length=50, num_return_sequences=1)

print(output)
