In [None]:
!pip install transformers

In [11]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import  BertTokenizer

class BERTDataset(Dataset):
    def __init__(self, corpus, label, max_len=300):
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base")
        self.vocab_size = self.tokenizer.vocab_size
        self.sentences = [self.transform(i) for i in corpus]
        self.labels = [np.array(i) for i in label]

    def transform(self, data):
        data = self.tokenizer(data, max_length=self.max_len, padding="max_length", truncation=True,)
        return np.array(data['input_ids']), np.array(data['token_type_ids']), np.array(data['attention_mask'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sentences[idx] + (self.labels[idx],)

In [7]:
import torch
import torch.nn as nn
from transformers import BertModel


class BERTClassifier(nn.Module):
    def __init__(self, hidden_size=768, num_classes=8, dr_rate=0.5):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("beomi/kcbert-base", return_dict=False)
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, input_ids, token_type_ids, attention_mask):
        _, pooler = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        out = self.dropout(pooler)
        return  self.classifier(out)

In [8]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

max_len = 512
batch_size = 4
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
learning_rate = 5e-5

df_token = pd.read_csv('/content/drive/MyDrive/NLP/Datasets/train_category.csv')
corpus = [t for t in df_token['text']]
label = to_categorical(LabelEncoder().fit_transform(df_token['label']))

train_dataset = BERTDataset(corpus, label)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, shuffle=True)

In [10]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

model = BERTClassifier()
model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
losses = []
for epoch in range(num_epochs):
    model.train()
    for batch_id, (input_ids, token_type_ids, attention_mask, label) in enumerate(tqdm(train_dataloader)):
        input_ids = input_ids.long().to(device)
        token_type_ids = token_type_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.to(device)

        optimizer.zero_grad()
        out = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(out, label)
        loss.backward()
        losses.append(loss.item())

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    print(sum(losses)/len(losses))
    state = {'Epoch': epoch,
             'State_dict': model.state_dict(),
             'Optimizer': optimizer.state_dict()}

100%|██████████| 549/549 [02:26<00:00,  3.76it/s]


0.989671998102564


100%|██████████| 549/549 [02:26<00:00,  3.75it/s]


0.7694407114602171


100%|██████████| 549/549 [02:26<00:00,  3.74it/s]


0.6092418337632541


100%|██████████| 549/549 [02:26<00:00,  3.74it/s]


0.4936553196476235


100%|██████████| 549/549 [02:26<00:00,  3.74it/s]

0.40949471909934504





In [19]:
torch.cuda.empty_cache()
model.eval()
acc = []
for batch_id, (input_ids, token_type_ids, attention_mask, label) in enumerate(tqdm(train_dataloader)):
    input_ids = input_ids.long().to(device)
    token_type_ids = token_type_ids.long().to(device)
    attention_mask = attention_mask.long().to(device)
    label = label.to(device)

    out = model(input_ids, token_type_ids, attention_mask)
    acc.append(out)

  0%|          | 0/549 [00:00<?, ?it/s]


RuntimeError: ignored

In [17]:
print(acc)

[tensor([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False]],
       device='cuda:0'), tensor([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False]],
       device='cuda:0'), tensor([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False]],
       device='cuda:0'), tensor([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False