In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2ForSequenceClassification

warnings.filterwarnings('ignore')

In [4]:
tr = pd.read_csv('./data_in/open/train_data.csv', index_col='index')


In [18]:
tokenizer = AutoTokenizer.from_pretrained("skt/kobert")

model = GPT2ForSequenceClassification.from_pretrained("klue/kobert")
model.score = torch.nn.Linear(768, 7)
model.cuda()

Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

You are using a model of type roberta to instantiate a model of type gpt2. This is not supported for all configurations of models and can yield errors.
Can't set hidden_size with value 1024 for GPT2Config {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "model_type": "gpt2",
  "pad_token_id": 1,
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.8.2"
}



AttributeError: can't set attribute

In [6]:
class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document, label = str(record['title']), int(record['topic_idx'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float),
                'labels': np.array(label, dtype=np.int_)}
    
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document = str(record['title'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float)}

In [7]:
# train parameters
epochs = 10
batch_size = 32

In [8]:
# train loader
train_ds = TrainDataset(tr, tokenizer)
loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, shuffle=True)

In [9]:
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5, )
loss_fn = torch.nn.CrossEntropyLoss()

In [10]:
model.train()
for e in range(epochs):
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        ids, atts, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        ids = torch.tensor(ids).long().cuda()
        atts = torch.tensor(atts).long().cuda()
        labels = torch.tensor(labels).long().cuda()
        pred = model(ids, attention_mask=atts)
        loss = loss_fn(pred[0], labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        optimizer.step()
        total_loss += loss.item()
        
    scheduler.step()
    print(e, total_loss)

0 798.4672225415707
1 425.72672291472554
2 216.4520208099857
3 91.64503619493917
4 37.197267876743354
5 23.07639458939957
6 27.07165218438513
7 37.48703969925191
8 68.31385060424645
9 99.99077906967432


In [12]:
# test loader
te = pd.read_csv('./data_in/open/test_data.csv', index_col='index')

test_ds = TestDataset(te, tokenizer)
test_loader = DataLoader(test_ds, 8)

In [13]:
preds = []
model.eval()

for b in tqdm(test_loader):
    ids, atts = b['input_ids'], b['attention_mask']
    ids = torch.tensor(ids).long().cuda()
    atts = torch.tensor(atts).long().cuda()
    pred = model(ids, attention_mask=atts)
    preds += list(np.argmax(pred[0].detach().cpu().numpy(), 1))
#     break

100%|██████████████████████████████████████████████████████████████████████████████| 1142/1142 [00:14<00:00, 77.63it/s]


In [14]:
sub = pd.read_csv('./data_in/open/sample_submission.csv', index_col='index')
sub['topic_idx'] = preds
sub.head(20)

Unnamed: 0_level_0,topic_idx
index,Unnamed: 1_level_1
45654,3
45655,3
45656,0
45657,2
45658,3
45659,2
45660,5
45661,3
45662,4
45663,4


In [15]:
sub.to_csv('./gpt.csv')

In [16]:
te.head(20)

Unnamed: 0_level_0,title
index,Unnamed: 1_level_1
45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영
45655,어버이날 맑다가 흐려져…남부지방 옅은 황사
45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다
45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것
45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간
45659,야외서 생방송 하세요…액션캠 전용 요금제 잇따라
45660,월드컵 태극전사 16강 전초기지 레오강 입성종합
45661,미세먼지 속 출근길
45662,왓츠앱稅 230원에 성난 레바논 민심…총리사퇴로 이어져종합2보
45663,베트남 경제 고성장 지속…2분기 GDP 6.71% 성장
