In [58]:
import torch
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    pad_token='<pad>', mask_token='<mask>')
gpt = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [59]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import  BertTokenizer

class GPTDataset(Dataset):
    def __init__(self, tokenizer, corpus, labels, maxlen=128):
        super(GPTDataset, self).__init__()
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.inputs = [self.tokenize(text) for text in corpus]
        self.labels = [np.array(label) for label in labels]

    def tokenize(self, data):
        data = self.tokenizer(data, max_length=self.maxlen, padding="max_length", truncation=True,)
        return np.array(data['input_ids']), np.array(data['token_type_ids']), np.array(data['attention_mask'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

In [60]:
import torch
import torch.nn as nn
from transformers import BertModel


class GPTClassifier(nn.Module):
    def __init__(self, gpt, hidden_size, max_len, num_classes):
        super(GPTClassifier, self).__init__()
        self.gpt = gpt
        self.classifier = nn.Linear(hidden_size * max_len, num_classes)

    def forward(self, input_ids, attention_mask):
        pooler = gpt(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        output = pooler['hidden_states'][-1]
        batch_size = output.shape[0]
        output = self.classifier(output.reshape(batch_size, -1))
        return output

In [61]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


max_len = 256
batch_size = 4
warmup_ratio = 0.1
num_epochs = 20
max_grad_norm = 1
learning_rate = 5e-5

df = pd.read_csv('/home/jone/NLP/Dataset/newszum_train_data.csv')
corpus = [t for t in df['cleanBody']]
label = to_categorical(LabelEncoder().fit_transform(df['category']))

train_dataset = GPTDataset(tokenizer, corpus, label, maxlen=max_len)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, shuffle=True)

In [62]:
from transformers.optimization import get_cosine_schedule_with_warmup

device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
model = GPTClassifier(gpt, hidden_size=768, max_len=max_len, num_classes=8)
model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [63]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [64]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

losses = []
for epoch in range(num_epochs):
    model.train()
    for batch_id, (inputs, label) in enumerate(tqdm(train_dataloader, ncols=0)):
        input_ids = inputs[0].long().to(device)
        attention_mask = inputs[2].long().to(device)
        label = label.to(device)

        optimizer.zero_grad()
        out = model(input_ids, attention_mask)
        loss = loss_fn(out, label)
        loss.backward()
        losses.append(loss.item())

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    print(sum(losses)/len(losses))
    state = {'Epoch': epoch,
             'State_dict': model.state_dict(),
             'Optimizer': optimizer.state_dict()}

100% 549/549 [01:31<00:00,  6.02it/s]


1.2504429338115823


100% 549/549 [01:31<00:00,  5.99it/s]


1.1486222235886863


100% 549/549 [01:31<00:00,  6.00it/s]


1.0017489816714613


100% 549/549 [01:31<00:00,  5.99it/s]


0.8379553399255363


100% 549/549 [01:32<00:00,  5.96it/s]


0.7118399206912006


100% 549/549 [01:32<00:00,  5.97it/s]


0.619772318780784


100% 549/549 [01:34<00:00,  5.82it/s]


0.5429411759966972


100% 549/549 [01:32<00:00,  5.96it/s]


0.4838886002486843


100% 549/549 [01:32<00:00,  5.91it/s]


0.4379992100349481


100% 549/549 [01:33<00:00,  5.86it/s]


0.40149266714578713


100% 549/549 [01:31<00:00,  5.98it/s]


0.3692372801550298


100% 549/549 [01:31<00:00,  5.99it/s]


0.3407654295760692


100% 549/549 [01:34<00:00,  5.83it/s]


0.3145608025844555


100% 549/549 [01:33<00:00,  5.90it/s]


0.292092177434402


100% 549/549 [01:31<00:00,  5.98it/s]


0.2726193665753272


100% 549/549 [01:32<00:00,  5.93it/s]


0.2555806573721958


100% 549/549 [01:32<00:00,  5.96it/s]


0.24054650150962129


100% 549/549 [01:31<00:00,  5.97it/s]


0.22718301181291856


100% 549/549 [01:32<00:00,  5.90it/s]


0.21522602419630307


100% 549/549 [01:33<00:00,  5.86it/s]

0.204464723358337





In [65]:
gc.collect()
torch.cuda.empty_cache()

df_test = pd.read_csv('/home/jone/NLP/Dataset/newszum_test_data.csv')
test_corpus = [t for t in df_test['cleanBody']]
test_label = to_categorical(LabelEncoder().fit_transform(df_test['category']))

test_dataset = GPTDataset(tokenizer, test_corpus, test_label, maxlen=max_len)
test_dataloader = DataLoader(test_dataset, batch_size=1, num_workers=2, shuffle=False)

In [66]:
model.eval()
fail = []
with torch.no_grad():
    for idx, (inputs, label) in enumerate(tqdm(test_dataloader, ncols=0)):
        input_ids = inputs[0].long().to(device)
        attention_mask = inputs[2].long().to(device)

        out = model(input_ids, attention_mask)
        pred = np.argmax(out.cpu().detach().numpy(), axis=1)[0]

        if pred != np.argmax(label.numpy()):
          fail.append(idx)

100% 942/942 [00:15<00:00, 59.89it/s]


In [67]:
print(len(fail)/len(test_dataset))

0.14225053078556263
