In [1]:
import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import re
from tqdm import tqdm
import pandas as pd
import torch
from transformers import GPT2LMHeadModel

device = torch.device("cuda:1")

In [2]:
Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

In [3]:
koGPT2_TOKENIZER = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK) 
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
Chatbot_Data = pd.read_csv("../Data_preprocessing/custom_chatbotdataset(Training).csv")
Chatbot_Data_validation = pd.read_csv("../Data_preprocessing/custom_chatbotdataset(Validation).csv")

In [5]:
Chatbot_Data.shape

(145954, 3)

In [6]:
Chatbot_Data.head()

Unnamed: 0,label,Q,A
0,9,일은 왜 해도 해도 끝이 없을까? 화가 난다.,많이 힘드시겠어요. 주위에 의논할 상대가 있나요?
1,9,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나.,급여가 줄어 속상하시겠어요. 월급이 줄어든 것을 어떻게 보완하실 건가요?
2,9,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...,회사 동료 때문에 스트레스를 많이 받는 것 같아요. 문제 해결을 위해 어떤 노력을 ...
3,9,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...,관련 없는 심부름을 모두 하게 되어서 노여우시군요. 어떤 것이 상황을 나아질 수 있...
4,9,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나.,무시하는 것 같은 태도에 화가 나셨군요. 상대방의 어떤 행동이 그런 감정을 유발하는...


In [7]:
Chatbot_Data['label'].nunique()

58

In [8]:
class ChatbotDataset(Dataset):
    def __init__(self, chats, max_len=40):  # 데이터셋의 전처리를 해주는 부분
        self._data = chats
        self.max_len = max_len
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.eos = EOS
        self.mask = MASK
        self.tokenizer = koGPT2_TOKENIZER

    def __len__(self):  # chatbotdata 의 길이를 리턴한다.
        return len(self._data)

    def __getitem__(self, idx):  # 로드한 챗봇 데이터를 차례차례 DataLoader로 넘겨주는 메서드
        turn = self._data.iloc[idx]
        
        q = turn["Q"]  # 질문을 가져온다.
        q = re.sub(r"([?.!,])", r" ", q)  # 구둣점들을 제거한다.

        a = turn["A"]  # 답변을 가져온다.
        a = re.sub(r"([?.!,])", r" ", a)  # 구둣점들을 제거한다.

        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        q_len = len(q_toked)

        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)

        #질문의 길이가 최대길이보다 크면
        if q_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로 
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        #질문의 길이 + 답변의 길이가 최대길이보다 크면
        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len        #답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:       #질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]   #질문길이를 최대길이의 반으로 
                q_len = len(q_toked)
                a_len = self.max_len - q_len              #답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 답변 labels = [mask, mask, ...., mask, ..., <bos>,..답변.. <eos>, <pad>....]
        labels = [self.mask,] * q_len + a_toked[1:]

        # mask = 질문길이 0 + 답변길이 1 + 나머지 0
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)
        # 답변 labels을 index 로 만든다.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        # 최대길이만큼 PADDING
        while len(labels_ids) < self.max_len:
            labels_ids += [self.tokenizer.pad_token_id]

        # 질문 + 답변을 index 로 만든다.    
        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        # 최대길이만큼 PADDING
        while len(token_ids) < self.max_len:
            token_ids += [self.tokenizer.pad_token_id]

        #질문+답변, 마스크, 답변
        return (np.array(token_ids), np.array(mask), np.array(labels_ids))

In [9]:
train_set = ChatbotDataset(Chatbot_Data, max_len=40)
train_dataloader = DataLoader(train_set, batch_size=128, num_workers=4, shuffle=False)

valid_set = ChatbotDataset(Chatbot_Data_validation, max_len=40)
valid_dataloader = DataLoader(valid_set, batch_size=128, num_workers=4, shuffle=False)

In [10]:
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [11]:
learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epoch = 30
Sneg = -1e18

In [12]:
print ("start")
for epoch in range(epoch):
    train_loss = 0.0
    valid_loss = 0.0
    
    model.train()
    for samples in tqdm(train_dataloader): # train
        optimizer.zero_grad()
        token_ids, mask, label = samples
        
        token_ids = token_ids.to(device)
        mask = mask.to(device)
        label = label.to(device)
        
        out = model(token_ids)
        
        out = out.logits      #Returns a new tensor with the logit of the elements of input
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        # 평균 loss 만들기 avg_loss[0] / avg_loss[1] <- loss 정규화
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        # 학습 끝
        optimizer.step()
        train_loss += avg_loss
    train_loss = (train_loss / len(train_dataloader))
    print('Epoch = {}, train loss = {}'.format((epoch+1), train_loss))
        
    with torch.no_grad(): # validation
        model.eval()
        for samples in tqdm(valid_dataloader):
            token_ids, mask, label = samples
            
            token_ids = token_ids.to(device)
            mask = mask.to(device)
            label = label.to(device)
            
            out = model(token_ids)
            
            out = out.logits
            mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
            mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
            loss = criterion(mask_out.transpose(2, 1), label)
            
            avg_loss = loss.sum() / mask.sum()

            valid_loss += avg_loss
        valid_loss = (valid_loss / len(valid_dataloader))
        print('Epoch = {}, validation loss = {}'.format((epoch+1), valid_loss))
            
print ("end")

start


  0%|          | 0/1141 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 1141/1141 [03:12<00:00,  5.93it/s]


Epoch = 1, train loss = 19.440927505493164


100%|██████████| 141/141 [00:08<00:00, 15.67it/s]


Epoch = 1, validation loss = 18.90532684326172


100%|██████████| 1141/1141 [03:12<00:00,  5.93it/s]


Epoch = 2, train loss = 19.251277923583984


100%|██████████| 141/141 [00:09<00:00, 15.64it/s]


Epoch = 2, validation loss = 18.81034278869629


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 3, train loss = 19.152250289916992


100%|██████████| 141/141 [00:09<00:00, 15.64it/s]


Epoch = 3, validation loss = 18.771814346313477


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 4, train loss = 19.06851577758789


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 4, validation loss = 18.746824264526367


100%|██████████| 1141/1141 [03:12<00:00,  5.93it/s]


Epoch = 5, train loss = 18.990047454833984


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 5, validation loss = 18.7320613861084


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 6, train loss = 18.917936325073242


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 6, validation loss = 18.726642608642578


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 7, train loss = 18.848318099975586


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 7, validation loss = 18.727272033691406


100%|██████████| 1141/1141 [03:12<00:00,  5.93it/s]


Epoch = 8, train loss = 18.782865524291992


100%|██████████| 141/141 [00:09<00:00, 15.63it/s]


Epoch = 8, validation loss = 18.733535766601562


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 9, train loss = 18.719282150268555


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 9, validation loss = 18.74243927001953


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 10, train loss = 18.65962028503418


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 10, validation loss = 18.746906280517578


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 11, train loss = 18.60184097290039


100%|██████████| 141/141 [00:09<00:00, 15.59it/s]


Epoch = 11, validation loss = 18.75702476501465


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 12, train loss = 18.546249389648438


100%|██████████| 141/141 [00:09<00:00, 15.60it/s]


Epoch = 12, validation loss = 18.768085479736328


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 13, train loss = 18.492446899414062


100%|██████████| 141/141 [00:09<00:00, 15.60it/s]


Epoch = 13, validation loss = 18.792253494262695


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 14, train loss = 18.441539764404297


100%|██████████| 141/141 [00:09<00:00, 15.63it/s]


Epoch = 14, validation loss = 18.80561637878418


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 15, train loss = 18.39220428466797


100%|██████████| 141/141 [00:09<00:00, 15.64it/s]


Epoch = 15, validation loss = 18.815645217895508


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 16, train loss = 18.345718383789062


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 16, validation loss = 18.830163955688477


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 17, train loss = 18.300098419189453


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 17, validation loss = 18.846508026123047


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 18, train loss = 18.256807327270508


100%|██████████| 141/141 [00:09<00:00, 15.65it/s]


Epoch = 18, validation loss = 18.869688034057617


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 19, train loss = 18.21488380432129


100%|██████████| 141/141 [00:09<00:00, 15.63it/s]


Epoch = 19, validation loss = 18.88942527770996


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 20, train loss = 18.175643920898438


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 20, validation loss = 18.918155670166016


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 21, train loss = 18.138370513916016


100%|██████████| 141/141 [00:09<00:00, 15.60it/s]


Epoch = 21, validation loss = 18.929574966430664


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 22, train loss = 18.102088928222656


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 22, validation loss = 18.946683883666992


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 23, train loss = 18.068113327026367


100%|██████████| 141/141 [00:09<00:00, 15.64it/s]


Epoch = 23, validation loss = 18.97782325744629


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 24, train loss = 18.037250518798828


100%|██████████| 141/141 [00:09<00:00, 15.63it/s]


Epoch = 24, validation loss = 18.99953842163086


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 25, train loss = 18.007205963134766


100%|██████████| 141/141 [00:09<00:00, 15.62it/s]


Epoch = 25, validation loss = 19.02802276611328


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 26, train loss = 17.979259490966797


100%|██████████| 141/141 [00:09<00:00, 15.60it/s]


Epoch = 26, validation loss = 19.044757843017578


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 27, train loss = 17.954402923583984


100%|██████████| 141/141 [00:09<00:00, 15.60it/s]


Epoch = 27, validation loss = 19.0684814453125


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 28, train loss = 17.930761337280273


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 28, validation loss = 19.094003677368164


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 29, train loss = 17.909412384033203


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]


Epoch = 29, validation loss = 19.12160873413086


100%|██████████| 1141/1141 [03:12<00:00,  5.94it/s]


Epoch = 30, train loss = 17.88910484313965


100%|██████████| 141/141 [00:09<00:00, 15.61it/s]

Epoch = 30, validation loss = 19.14961814880371
end





## 챗봇 테스트

In [None]:
# with torch.no_grad():
#     while 1:
#         q = input("user > ").strip()
#         if q == "quit":
#             break
#         a = ""
#         while 1:
#             input_ids = torch.LongTensor(koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + '0' + A_TKN + a)).unsqueeze(dim=0)
#             pred = model(input_ids.to(device))
#             pred = pred.logits
#             gen = koGPT2_TOKENIZER.convert_ids_to_tokens(torch.argmax(pred, dim=-1).squeeze().cpu().numpy().tolist())[-1]
#             if gen == EOS:
#                 break
#             a += gen.replace("▁", " ")
#         print("Chatbot > {}".format(a.strip()))