In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 17.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 15.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import os
import re
from tqdm.notebook import tqdm

In [None]:
# Load the KoGPT2 tokenizer, spelling out every special token explicitly
# (BOS and EOS share the same '</s>' symbol).
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

# Pretrained causal language model that is fine-tuned further down.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Read the raw dialog corpus; the file is decoded as cp949.
dataset = pd.read_csv(
    '/content/dialog_Training.csv',
    encoding='cp949',
)
# Show the available columns.
dataset.columns


  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['번호', '연령', '성별', '상황키워드', '신체질환', '감정_대분류', '감정_소분류', '사람문장1',
       '시스템응답1', '사람문장2', '시스템응답2', '사람문장3', '시스템응답3', '사람문장4', '시스템응답4'],
      dtype='object')

In [None]:
# Human utterance columns for turns 1-3 of every dialog.
q1, q2, q3 = (
    dataset.loc[:, column]
    for column in ('사람문장1', '사람문장2', '사람문장3')
)

# System response columns for the same three turns.
a1, a2, a3 = (
    dataset.loc[:, column]
    for column in ('시스템응답1', '시스템응답2', '시스템응답3')
)

In [None]:
# Stack the three turns end to end (row-wise). Both series are built in the
# same order, so element i of `question` pairs with element i of `answer`.
question = pd.concat((q1, q2, q3))
answer = pd.concat((a1, a2, a3))

In [None]:
# Sanity check: the two stacked series must have the same number of entries.
print(len(question), len(answer), sep="\n")

112817
112817


In [None]:
# Rebuild `dataset` as a two-column frame of (question, answer) pairs with a
# fresh positional index.
dataset = pd.DataFrame({'Q': question.tolist(), 'A': answer.tolist()})

In [None]:
# Peek at the last five question/answer pairs.
dataset.tail(5)

Unnamed: 0,Q,A
112812,최 영감에게 기분 좋은 말을 해줘야 할 것 같아.,최 영감에게 기분 좋은 말을 함으로써 잘 해결이 되었으면 좋겠어요.
112813,이대로 내가 키우게 되면 안 되니까 확실하게 해야겠어.,그렇게 하셨을 떄 지금의 상황이 어떻게 되기를 바라시나요?
112814,내가 다시 내 꿈을 어떻게 이룰 것인지 자세히 설명해 드려야겠어.,꿈을 어떻게 이룰 것인지 엄마에게 이야기해보려 하시는군요.
112815,운동으로 뭉친 근육을 풀어주는 것 같아. 그럼 덜 피로하겠지.,안마기로 피로가 많이 풀리시길 바라요.
112816,직접 서운한 감정을 친구에게 얘기하려고 해.,이번에도 대화를 통해 불편한 마음을 풀려고 하시는군요.


In [None]:
# Drop every pair whose question or answer is missing. Rebinding the name
# (instead of inplace=True) keeps the cell idempotent when it is re-run.
dataset = dataset.dropna()

In [None]:
# Number of question/answer pairs left after removing incomplete rows.
dataset.shape[0]

112817

In [None]:
# Special tokens used to lay out one training example.
Q_TKN = "<usr>"       # prefixes the user's question
A_TKN = "<sys>"       # prefixes the system's answer
BOS = '</s>'          # beginning-of-sequence (same symbol as EOS)
EOS = '</s>'          # end-of-sequence, appended to every answer
MASK = '<unused0>'    # label placeholder for question positions
SENT = '<unused1>'    # separator appended after the question
PAD = '<pad>'         # padding symbol

class kogpt_dataset(Dataset):
    """Dataset that turns question/answer rows into fixed-length training examples.

    Every item is built from one row of ``chats`` (columns ``'Q'`` and ``'A'``):
    the question is wrapped as ``Q_TKN + question + SENT`` and the answer as
    ``A_TKN + answer + EOS``. Both are tokenized, truncated so that together
    they fit in ``max_len`` tokens, and padded to exactly ``max_len``.

    The tokenizer and the special-token strings are read from module-level
    names (``tokenizer``, ``Q_TKN``, ``A_TKN``, ``SENT``, ``EOS``, ``MASK``)
    when the dataset is constructed.
    """

    # Punctuation removed from both question and answer before tokenizing.
    _PUNCTUATION = re.compile(r"([?.!,])")

    def __init__(self, chats, max_len = 40):
        """Store the data frame, the sequence length and the special tokens.

        Args:
            chats: frame-like object with 'Q' and 'A' columns, indexed with ``.iloc``.
            max_len: total number of tokens (question + answer) per example.
        """
        self.data = chats
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.eos = EOS
        self.mask = MASK

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the training example for row ``idx``.

        Returns:
            A tuple ``(token_ids, mask, labels_ids)``:
            * ``token_ids``: list of ``max_len`` ids for question + answer, padded.
            * ``mask``: numpy array of ``max_len`` ints, 1 on answer positions, 0 elsewhere.
            * ``labels_ids``: list of ``max_len`` ids: the MASK id on question
              positions, followed by the answer tokens shifted left by one
              (the leading answer marker is dropped), padded.
        """
        dialog_data = self.data.iloc[idx]
        q = self._PUNCTUATION.sub("", dialog_data['Q'])
        a = self._PUNCTUATION.sub("", dialog_data['A'])

        q_token = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        a_token = self.tokenizer.tokenize(self.a_token + a + self.eos)
        q_len = len(q_token)
        a_len = len(a_token)

        # Single truncation step. The original repeated this check three times,
        # but after the first application the pair always fits, so the later
        # copies never ran (and the last one discarded its slice result).
        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len
            if a_len <= 0:
                # The question alone fills the budget: keep only its last
                # max_len/2 tokens and give the remainder to the answer.
                q_token = q_token[-(int(self.max_len / 2)):]
                q_len = len(q_token)
                a_len = self.max_len - q_len
            # NOTE(review): cutting the answer here also drops its trailing
            # EOS token; kept as in the original.
            a_token = a_token[:a_len]
            a_len = len(a_token)

        # Question positions carry the MASK placeholder; answer labels are the
        # answer tokens shifted by one so each position predicts the next token.
        labels = [self.mask] * q_len + a_token[1:]
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)

        pad_id = self.tokenizer.pad_token_id

        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        labels_ids += [pad_id] * (self.max_len - len(labels_ids))

        token_ids = self.tokenizer.convert_tokens_to_ids(q_token + a_token)
        token_ids += [pad_id] * (self.max_len - len(token_ids))

        return (token_ids, np.array(mask), labels_ids)

# Function that converts a batch of dataset items into torch tensors.
def collate_batch(batch):
    """Stack a list of ``(token_ids, mask, labels_ids)`` items into three LongTensors.

    Args:
        batch: sequence of items; positions 0, 1 and 2 of each item are used.

    Returns:
        Tuple ``(data, mask, label)`` of ``torch.LongTensor``s, one row per item.
    """
    columns = [[item[position] for item in batch] for position in range(3)]
    return tuple(torch.LongTensor(column) for column in columns)

# Wrap the question/answer frame as a dataset of 40-token examples and serve
# it in shuffled batches of 32, collated into LongTensors.
train_set = kogpt_dataset(chats=dataset, max_len=40)
train_dataloader = DataLoader(
    dataset=train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Pull a single batch and check the type of its first element (the token ids).
# The built-in next() is the Python 3 iterator protocol; the iterator's
# .next() method is not available on current DataLoader iterators.
type(next(iter(train_dataloader))[0])

torch.Tensor

In [None]:
# Move the model to the selected device and switch it to training mode.
# Both calls return the module itself, so the chained expression still
# displays the model summary as the cell output.
model.to(device).train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
# Optimisation settings for fine-tuning.
learning_rate = 0.0001
# Large negative fill value for logits at masked-out positions.
Sneg = -1e18
# Keep per-token losses (no reduction): the training step sums them and divides
# by the number of answer tokens. With the default 'mean' reduction the loss
# would already be a scalar average and would end up normalised twice.
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Fine-tune the model for 7 passes over the training data.
# The loss is the cross-entropy averaged over answer tokens only: question and
# padding positions are zeroed out by the batch mask and contribute nothing.
print ("start")
for epoch in range(7):
    for batch_idx, samples in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        # Each batch is (token_ids, mask, label) as produced by collate_batch.
        token_ids, mask, label = samples[0].to(device), samples[1].to(device), samples[2].to(device)
        logits = model(token_ids).logits
        # cross_entropy wants the class dimension second, hence the transpose;
        # reduction="none" keeps one loss value per token position.
        token_loss = torch.nn.functional.cross_entropy(
            logits.transpose(2, 1), label, reduction="none"
        )
        # Average over answer tokens only (mask == 1). The clamp avoids a
        # division by zero should a batch contain no answer tokens.
        answer_mask = mask.to(token_loss.dtype)
        avg_loss = (token_loss * answer_mask).sum() / answer_mask.sum().clamp(min=1)
        avg_loss.backward()
        # End of the training step.
        optimizer.step()
print ("end")

start


  0%|          | 0/3526 [00:00<?, ?it/s]

  0%|          | 0/3526 [00:00<?, ?it/s]

In [None]:
# Mount Google Drive at /content/drive so checkpoints can be written there.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Google Drive folder used as the prefix for every checkpoint file saved or
# loaded below; the trailing slash is required because file names are appended
# with plain string concatenation.
path = '/content/drive/MyDrive/'

In [None]:
# Persist only the learned weights (the state dict) to Drive.
torch.save(obj=model.state_dict(), f=path + 'kogpt_state1.pt')

In [None]:
# Also pickle the whole model object (architecture + weights) to Drive.
torch.save(obj=model, f=path + 'kogpt_model1.pt')

In [None]:
# Rebuild a fresh model and load the fine-tuned weights into it.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Load the checkpoint this notebook writes ('kogpt_state1.pt'); the previous
# name 'kogpt_state.pt' is never produced here, so a fresh run would fail.
# map_location lets weights saved from a GPU run load on a CPU-only machine.
finetuned_state = torch.load(path + 'kogpt_state1.pt', map_location='cpu')
model.load_state_dict(finetuned_state)

<All keys matched successfully>

In [None]:
# Interactive chat: read a question, greedily decode an answer one token at a
# time until EOS, print it, and repeat until the user types "quit".

# Upper bound on generated tokens per answer, so a reply that never reaches
# EOS cannot loop forever (40 matches the max_len used for training examples).
MAX_ANSWER_TOKENS = 40

# Run on whatever device the model currently lives on, with dropout disabled.
infer_device = next(model.parameters()).device
model.eval()

with torch.no_grad():
    while True:
        q = input("user > ").strip()
        if q == "quit":
            break
        # Apply the same punctuation stripping the training data went through.
        q = re.sub(r"([?.!,])", r"", q)
        a = ""
        for _ in range(MAX_ANSWER_TOKENS):
            # Re-encode the prompt plus everything generated so far.
            input_ids = torch.LongTensor(
                tokenizer.encode(Q_TKN + q + SENT + A_TKN + a)
            ).unsqueeze(dim=0).to(infer_device)
            logits = model(input_ids).logits
            # Greedy choice: most likely token after the last position.
            next_id = int(torch.argmax(logits[0, -1]))
            gen = tokenizer.convert_ids_to_tokens(next_id)
            if gen == EOS:
                break
            # "▁" is the word-boundary marker; turn it back into a space.
            a += gen.replace("▁", " ")
        print("Chatbot > {}".format(a.strip()))

user > 졸려
Chatbot > 그렇군요 그 이유가 있나요
user > 파이썬
Chatbot > 무슨 일인가요
user > 탬탬버린
Chatbot > 기분이 좋으시겠어요 그 무엇 때문에 그렇게 생각하시나요
user > 술 마셨오ㅓ
Chatbot > 술 마셨군요 무슨 일이 있으신가요
user > 너는 몇살이야?
Chatbot > 무슨 일 있으신가요
user > 막국수
Chatbot > 막막하신가 보네요 무슨 일 있으신가요
user > 안녕?
Chatbot > 어떤 일이 있었나요
user > 안녕하세요
Chatbot > 어떤 일이 있었나요
user > 행복해요
Chatbot > 행복하시겠어요 그 이유가 무엇인가요
user > 승진이 너무 힘들어요
Chatbot > 승진이 너무 힘드시군요
user > 만두가 먹고싶어요
Chatbot > 다 그런 일이 있으셨군요
user > 대학교에 가선 뭘 하나요?
Chatbot > 그렇군요 그 이유가 있나요
user > 안경 사야되요
Chatbot > 그렇군요 그 이유가 있나요
user > 로스트아크
Chatbot > 로테스트에서 좋은 성과도 이루고 휴가도 받아서 기쁘시겠어요
user > 취업을 하고 싶어요
Chatbot > 어떤 점이 그렇게 취업하고 싶게 만드나요
user > 게임은 나쁜건가요?
Chatbot > 무슨 좋은 일이 있으신가요
user > 운전을 배워야 해요
Chatbot > 운전을 배운 것이 후회되시나요


KeyboardInterrupt: ignored