In [1]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-1u25rt5s
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-1u25rt5s


In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm, tqdm_notebook
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import pandas as pd
import os
import sys

# from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'torch'

In [3]:
sys.path.append("/content/drive/My Drive/IIPL/typo_aug/")
from tokenization_kobert import KoBertTokenizer


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
CFG = {
    "DEVICE" : torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu'),
    "BATCH_SIZE" : 16,
    "MAX_LENGTH" : 32,
    "NUM_EPOCHS" : 5,
    "LEARNING_RATE" : 3e-6,
    "WARMUP_RATIO" : 0.1,
    "MAX_GRAD_NORM" : 1,
    "LOG_INTERVAL" : 200,
    "PATH" : os.getcwd()
}

In [6]:
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') # monologg/distilkobert도 동일


In [7]:
tokenizer

PreTrainedTokenizer(name_or_path='monologg/kobert', vocab_size=8002, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
df_train = pd.read_csv(CFG["PATH"]+"ratings_train.txt", sep='\t')
df_test = pd.read_csv(CFG["PATH"]+"ratings_test.txt", sep='\t')
df_train.drop(columns=["id"])
df_test.drop(columns=["id"])

Unnamed: 0,document,label
0,굳 ㅋ,1
1,GDNTOPCLASSINTHECLUB,0
2,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
...,...,...
49995,오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함,1
49996,의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따 OOOO,0
49997,그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다,0
49998,절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네,0


In [9]:
df_train = df_train.dropna(axis=0).reset_index(drop=True)
df_test = df_test.dropna(axis=0).reset_index(drop=True)

In [11]:
df_train["tokenized"] = df_train["document"].map(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=CFG["MAX_LENGTH"]))
df_test["tokenized"] = df_test["document"].map(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=CFG["MAX_LENGTH"]))


In [14]:
class BERTDataset(Dataset):
  def __init__(self, dataset, label, max_length=CFG["MAX_LENGTH"]):
    self.dataset = dataset
    self.label = label
    self.max_length = max_length
  def __getitem__(self,idx):
    return (np.array(self.dataset[idx]["input_ids"]),
            self.max_length, 
            np.array(self.dataset[idx]["token_type_ids"]), 
            self.label[idx])

  def __len__(self):
    return len(self.label)

In [16]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2, 
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, 
                              token_type_ids = segment_ids.long(), 
                              attention_mask = attention_mask.float().to(token_ids.device),
                              return_dict=False)
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [17]:
train_dataset = BERTDataset(df_train["tokenized"], df_train['label'])
test_dataset = BERTDataset(df_test["tokenized"], df_test['label'])

In [18]:
train_dataloader = DataLoader(train_dataset, batch_size = CFG["BATCH_SIZE"], num_workers=0, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = CFG["BATCH_SIZE"], num_workers=0,shuffle=True)

In [19]:
bert_model = BertModel.from_pretrained('monologg/kobert')
model = BERTClassifier(bert_model, dr_rate=0.5).to(CFG["DEVICE"])
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=CFG["LEARNING_RATE"])
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * CFG["NUM_EPOCHS"]
warmup_step = int(t_total * CFG["WARMUP_RATIO"])

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [20]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [21]:
for e in range(CFG["NUM_EPOCHS"]):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.to(CFG["DEVICE"])
        segment_ids = segment_ids.to(CFG["DEVICE"])
        valid_length = valid_length
        label = label.to(CFG["DEVICE"])
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG["MAX_GRAD_NORM"])
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, label)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.to(CFG["DEVICE"])
        segment_ids = segment_ids.to(CFG["DEVICE"])
        valid_length= valid_length
        label = label.to(CFG["DEVICE"])
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/9375 [00:00<?, ?it/s]

epoch 1 train acc 0.7577909090909091


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/3125 [00:00<?, ?it/s]

epoch 1 test acc 0.86106


  0%|          | 0/9375 [00:00<?, ?it/s]

epoch 2 train acc 0.8714775757575758


  0%|          | 0/3125 [00:00<?, ?it/s]

epoch 2 test acc 0.87558


  0%|          | 0/9375 [00:00<?, ?it/s]

epoch 3 train acc 0.8931133333333333


  0%|          | 0/3125 [00:00<?, ?it/s]

epoch 3 test acc 0.8808753846153847


  0%|          | 0/9375 [00:00<?, ?it/s]

epoch 4 train acc 0.9058206060606061


  0%|          | 0/3125 [00:00<?, ?it/s]

epoch 4 test acc 0.8826661538461539


  0%|          | 0/9375 [00:00<?, ?it/s]

epoch 5 train acc 0.9114139393939394


  0%|          | 0/3125 [00:00<?, ?it/s]

epoch 5 test acc 0.8839953846153846


In [22]:
tokenizer.decode(next(iter(train_dataloader))[0][1])

'[CLS]...1점짜리영화...이런졸작은처음...[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'