In [None]:
!pip install -q torch torchvision transformers kogpt2 pytorch_lightning

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
from kogpt2_transformers import get_kogpt2_tokenizer

In [None]:
# Dataset definition for fine-tuning the KoGPT model
class MyDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per item.

    Args:
        texts: Indexable collection of strings; must support ``len()`` and ``[]``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its ids and attention mask.

        Returns:
            dict with keys ``"input_ids"`` and ``"attention_mask"``, each with the
            leading batch axis produced by ``return_tensors="pt"`` removed.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis. A bare ``.squeeze()`` would also
        # collapse the sequence axis when max_len == 1 and yield a 0-d tensor.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

In [None]:
# Prepare the training and validation texts
train_texts = ["안녕하세요, 반갑습니다.", "오늘 날씨가 좋네요.", "저는 한국어 문장 생성을 배우고 있어요."]
val_texts = ["좋은 하루 되세요.", "내일 뭐 할까요?"]

# Tokenizer and datasets.
# The tokenizer is loaded from the same checkpoint as the model so that the token
# ids it produces index that model's own vocabulary. A pad token is set explicitly
# because MyDataset pads every sample with padding="max_length".
MODEL_NAME = "skt/kogpt2-base-v2"
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    MODEL_NAME,
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
train_dataset = MyDataset(train_texts, tokenizer, max_len=32)
val_dataset = MyDataset(val_texts, tokenizer, max_len=32)

# Model and training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.to(device)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# NOTE(review): `criterion` is not used by the visible training loop, which reads
# `outputs.loss` from the model instead; kept in case other cells reference it.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Fine-tune the model
num_epochs = 5

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Every sample is padded to a fixed length, so padded positions must be
        # excluded from the loss. Labels equal to -100 are ignored by the loss
        # the model computes internally; masked_fill returns a new tensor, so
        # input_ids itself is left untouched.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    train_loss /= len(train_loader)

    # ---- validation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Same padding mask as in training so both losses are comparable.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_loss += loss.item()

        val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.7f} | Val Loss: {val_loss:.7f}")

In [None]:
# Sentence generation helper
def generate_sentence(prompt, max_length=50, temperature=0.7, do_sample=True):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        prompt: Text the generated sequence starts with.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature; only used when ``do_sample`` is true.
        do_sample: When true (default), sample from the model so ``temperature``
            takes effect. When false, ``temperature`` is not passed at all and the
            model's default decoding strategy is used.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.
    """
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # The prompt is a single unpadded sequence, so every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Pass a pad id explicitly; fall back to EOS if the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Temperature is a sampling parameter: send it only when sampling is enabled.
    if do_sample:
        decoding_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        decoding_kwargs = {"do_sample": False}

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        pad_token_id=pad_token_id,
        **decoding_kwargs,
    )
    generated_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_sentence

In [None]:
# Example: generate a sentence from a short Korean greeting and print it.
prompt = "안녕하세요,"
generated_sentence = generate_sentence(prompt=prompt)  # other arguments keep their defaults
print(generated_sentence)