In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

In [5]:
#data_raw = pd.read_csv('__file_location__/data_equal_uni.csv', encoding='cp949')

In [6]:
data_raw.drop(columns = ['Unnamed: 0'], inplace=True)

In [7]:
U_TKN = '<usr>'
S_TKN = '<sys>'
BOS = '</s>'
EOS = '</s>'
MASK = '<unused0>'
SENT1 = '<unused1>'
SENT2 = '<unused2>'
PAD = '<pad>'

In [8]:
Tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK) 
model_kogpt2 = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
class mk_Dataset(Dataset):
  def __init__(self, chats, max_len = 64):
    self._data = chats
    self.first = True
    self.q_token = U_TKN
    self.a_token = S_TKN
    self.sent_token_mc = SENT1
    self.sent_token_sub = SENT2
    self.bos = BOS
    self.eos = EOS
    self.mask = MASK
    self.pad = PAD
    self.max_len = max_len
    self.tokenizer = Tokenizer

    question = self._data.loc[:,'usr'].values
    answer = self._data.loc[:,'sys'].values
    sentiment_m = self._data.loc[:,'mc_enco']
    sentiment_s = self._data.loc[:,'sub_enco']

    self.input_ids = []
    self.attention_mask = []
    self.token_type_ids = []
    self.labels = []

    for q, a, sm, ss in zip(question, answer, sentiment_m, sentiment_s):
      q_t = self.tokenizer.tokenize(self.q_token + q + self.sent_token_mc + str(sm) + self.sent_token_sub + str(ss))
      q_len = len(q_t)
      a_t = self.tokenizer.tokenize(self.a_token + a + self.eos)
      a_len = len(a_t)

      if q_len + a_len > self.max_len:
        a_len = self.max_len - q_len
        if a_len <= 0:
          q_t = q_t[-(int(self.max_len/2)):]
          q_len = len(q_t)
          a_len = self.max_len - q_len
          assert a_len > 0
      a_t = a_t[:a_len]
      a_len = len(a_t)

      input_id = self.tokenizer.convert_tokens_to_ids(q_t + a_t)
      label = input_id
      token_type_id = [0] * len(q_t) + [1] * (self.max_len - len(q_t))
      attention = [1] * len(input_id) + [0] * (self.max_len - len(input_id))

      while len(input_id) < self.max_len:
        input_id += [self.tokenizer.pad_token_id]
      while len(label) < self.max_len:
        label += [self.tokenizer.pad_token_id]
      
      self.input_ids.append(input_id)
      self.labels.append(label)
      self.token_type_ids.append(token_type_id)
      self.attention_mask.append(attention)

  def __getitem__(self, index):
    input_ids_index = self.input_ids[index]
    attention_mask_index = self.token_type_ids[index]
    token_type_ids_index = self.attention_mask[index]
    labels_index = self.labels[index]

    return {'input_ids':torch.tensor(input_ids_index).to(device), 
            'token_type_ids': torch.tensor(attention_mask_index).to(device), 
            'attention_mask': torch.tensor(token_type_ids_index).to(device), 
            'labels': torch.tensor(labels_index).to(device)}

  def __len__(self):
    return len(self._data)

In [12]:
train_set = mk_Dataset(data_raw, max_len=64)
train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True)

In [13]:
model_kogpt2.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [14]:
optimizer = torch.optim.Adam(model_kogpt2.parameters(), lr=0.0001)

In [15]:
model_kogpt2(**iter(train_dataloader).next()).loss

tensor(10.4621, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
model_kogpt2.train()
for epoch in tqdm(range(10)):
  for data in train_dataloader:
    optimizer.zero_grad()
    loss = model_kogpt2(**data).loss
    loss.backward()
    optimizer.step()
  print(loss)

try:
  torch.save({
      'epochs' : epoch,
      'model_state_dict' : model_kogpt2.state_dict(),
      'optimizer_state_dict' : optimizer.state_dict(),
      'loss' : loss
}, '__file_location__/220525_unified_equal_gpt_model.pt')
except:
  torch.save(model_kogpt2, '__file_location__/220525_unified_equal_gpt_model_full.pt')

 10%|█         | 1/10 [08:30<1:16:33, 510.37s/it]

tensor(1.3060, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|██        | 2/10 [17:01<1:08:04, 510.58s/it]

tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward0>)


 30%|███       | 3/10 [25:31<59:34, 510.67s/it]  

tensor(1.0107, device='cuda:0', grad_fn=<NllLossBackward0>)


 40%|████      | 4/10 [34:02<51:04, 510.68s/it]

tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward0>)


 50%|█████     | 5/10 [42:33<42:33, 510.70s/it]

tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward0>)


 60%|██████    | 6/10 [51:03<34:02, 510.69s/it]

tensor(0.5705, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 7/10 [59:34<25:32, 510.71s/it]

tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 8/10 [1:08:05<17:01, 510.77s/it]

tensor(0.5052, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 9/10 [1:16:36<08:30, 510.76s/it]

tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 10/10 [1:25:07<00:00, 510.73s/it]

tensor(0.4323, device='cuda:0', grad_fn=<NllLossBackward0>)



