In [1]:
!pip install transformers urllib3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re
import math
import torch
import random
import numpy as np
import pandas as pd
import urllib.request

from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset

from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from transformers.optimization import AdamW

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## **1. 모델 및 토크나이저 정의**

In [4]:
Q_TKN = '<usr>'
A_TKN = '<sys>'

BOS = '</s>'        # 문장의 시작을 나타내는 토큰
EOS = '</s>'        # 문장의 끝을 나타내는 토큰
PAD = '<pad>'       # 입력 길이를 동일하게 하기 위해 사용하는 토큰
UNK = '<unk>'

MASK = '<unused0>'
SENT = '<unused1>'

model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", bos_token=BOS, eos_token=EOS, unk_token=UNK, pad_token=PAD, mask_token=MASK)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


## **2. 데이터셋 다운로드**

In [5]:
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)

Chatbot_Data = pd.read_csv("ChatBotData.csv")

Chatbot_Data = Chatbot_Data[:500] # 500개 데이터만 사용
Chatbot_Data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## **3. 데이터셋 클래스 생성**

In [6]:
class ChatbotDataset(Dataset):
    def __init__(self, chats, max_len=40):
        self.data = chats
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.bos = BOS
        self.eos = EOS
        self.pad = PAD
        self.mask = MASK
        self.max_len = max_len
        self.tokenizer = tokenizer 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        turn = self.data.iloc[idx]

        q = turn['Q']                    # 질문 가져오기
        q = re.sub(r'([?.!,])', r'', q)  # 구두점 제거

        a = turn['A']                    # 답변 가져오기
        a = re.sub(r'([?.!,])', r'', a)  # 구두점 제거

        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        q_len = len(q_toked)

        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)

        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len

            if a_len <= 0:
                q_toked = q_toked[-(int(self.max_len/2)):] 
                q_len = len(q_toked)

                a_len = self.max_len - q_len

            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 답변 labels = [mask, mask, ...., mask, ..., <bos>,..답변.. <eos>, <pad>....]
        labels = [self.mask,] * q_len + a_toked[1:]

        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)

        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)

        while len(labels_ids) < self.max_len:
            labels_ids += [self.tokenizer.pad_token_id]

        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)

        while len(token_ids) < self.max_len:
            token_ids += [self.tokenizer.pad_token_id]

        return (token_ids, np.array(mask), labels_ids)

In [7]:
def collate_fn(batch):
  data = [item[0] for item in batch]
  mask = [item[1] for item in batch]
  label = [item[2] for item in batch]
      
  return torch.tensor(np.array(data)).to(device), torch.tensor(np.array(mask)).to(device), torch.tensor(np.array(label)).to(device)

In [8]:
train_dataset = ChatbotDataset(Chatbot_Data)
train_dataloader = DataLoader(train_dataset, batch_size=16, num_workers=0, shuffle=True, collate_fn=collate_fn)

## **4. 모델 학습**

In [9]:
epochs = 10
Sneg = -1e18
learning_rate = 3e-5

criterion = torch.nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
print ("start")

model.to(device)
model.train()

for epoch in tqdm(range(epochs)):
    for batch_idx, samples in enumerate(train_dataloader):
        optimizer.zero_grad()

        token_ids, mask, label = samples

        out = model(token_ids)
        out = out.logits

        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))

        loss = criterion(mask_out.transpose(2, 1), label)

        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()

        optimizer.step()

print ("end")

start


100%|██████████| 10/10 [01:07<00:00,  6.71s/it]

end





## **5. 챗봇 테스트**

In [11]:
with torch.no_grad():
    while 1:
        q = input("user > ").strip()

        if q == "quit":
            break

        a = ""

        while 1:
            input_ids = torch.LongTensor(tokenizer.encode(Q_TKN + q + SENT + '0' + A_TKN + a)).unsqueeze(dim=0).to(device)

            pred = model(input_ids)
            pred = pred.logits

            gen = tokenizer.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().cpu().numpy().tolist())[-1]

            if gen == EOS:
                break

            a += gen.replace("▁", " ")

        print("Chatbot > {}".format(a.strip()))

user > 새 옷을 샀어.
Chatbot > 새 옷을 사는 게 마음 편해요
user > 여름이 온 것 같아.
Chatbot > 더 좋은 날이 되길 바라요
user > 비가 와.
Chatbot > 마음이 아픈가요
user > 카페 갈까?
Chatbot > 다른 곳으로 이사갈 수 있을 거예요
user > 벌써 밤 10시야.
Chatbot > 하루가 또 가네요
user > quit
