In [82]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import  BertTokenizer

In [83]:
class BERTDataset(Dataset):
    def __init__(self, tokenizer, corpus, labels):
        super(BERTDataset, self).__init__()
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.inputs = [self.tokenize(text) for text in corpus]
        self.labels = [np.array(label) for label in labels]

    def tokenize(self, data):
        max_len = max([len(t) for t in data])
        data = tokenizer(data, max_length=max_len, padding="max_length", truncation=True, return_tensors='pt')
        return data['input_ids'], data['token_type_ids'], data['attention_mask']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

In [84]:
import torch
import torch.nn as nn
from transformers import BertModel


class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids, attention_mask):
        _, pooler = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        output = torch.mean(pooler, 0)
        output = self.cls(output)
        return output

In [85]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base")
bert = BertModel.from_pretrained("beomi/kcbert-base", return_dict=False)

max_len = 20
batch_size = 4
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
learning_rate = 5e-5

df_token = pd.read_csv('/home/jone/NLP/Dataset/newszum_train_data.csv')
corpus = df_token['cleanBody'].apply(lambda x: list(filter(None, x.split('.'))))
label = to_categorical(LabelEncoder().fit_transform(df_token['category']))

train_dataset = BERTDataset(tokenizer, corpus, label)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, shuffle=True)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
from transformers.optimization import get_cosine_schedule_with_warmup

model = BERTClassifier(bert, hidden_size=768, num_classes=8)
model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [88]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [89]:
losses = []
for epoch in range(num_epochs):
    model.train()
    for batch_id, (inputs, label) in enumerate(tqdm(train_dataloader, ncols=0)):
        input_ids = inputs[0].long().to(device)
        token_type_ids = inputs[1].long().to(device)
        attention_mask = inputs[2].long().to(device)
        label = label.to(device)

        optimizer.zero_grad()
        out = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(out, label)
        loss.backward()
        losses.append(loss.item())

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    print(sum(losses)/len(losses))
    state = {'Epoch': epoch,
             'State_dict': model.state_dict(),
             'Optimizer': optimizer.state_dict()}

  0% 0/549 [00:00<?, ?it/s]


OSError: [Errno 12] Cannot allocate memory