In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import numpy as np

import gluonnlp as nlp

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import time

import pandas as pd

In [2]:
import warnings 
warnings.simplefilter('ignore')

In [3]:
df_train = pd.read_csv("./train_for_korean.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./test_for_korean.csv", encoding="utf-8-sig")

In [4]:
df_train = df_train.dropna(axis=0).reset_index(drop=True)
space_idx = []
for i in range(len(df_train)):
    if str.isspace(df_train.iloc[i, 1]) == True:
        space_idx.append(i)
df_train = df_train.drop(space_idx)

In [5]:
df_test = df_test.dropna(axis=0).reset_index(drop=True)
space_idx = []
for i in range(len(df_test)):
    if str.isspace(df_test.iloc[i, 1]) == True:
        space_idx.append(i)
df_test = df_test.drop(space_idx)

In [6]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [7]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [8]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [9]:
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)
y_test = y_test.astype(np.int64)

In [10]:
bertmodel, vocab = get_pytorch_kobert_model()
bertmodel = bertmodel.from_pretrained("kobert_weight")

using cached model
using cached model


In [11]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [12]:
max_len = 64
transform = nlp.data.BERTSentenceTransform(
            tok, max_seq_length=max_len, pad=True, pair=False)

In [13]:
class CustomDataset(Dataset):
    def __init__(self, x, y, transform):
        self.idx = torch.tensor([transform([sentence])[0] for sentence in x]).reshape(len(x), -1)
        self.l = torch.tensor([transform([sentence])[1].item() for sentence in x])
        self.s = torch.tensor([transform([sentence])[2] for sentence in x]).reshape(len(x), -1)
        self.y = torch.tensor(y)
        
    def __getitem__(self, index):
        return (self.idx[index], self.l[index], self.s[index], self.y[index])

    def __len__(self):
        return len(self.idx)

In [14]:
trainset = CustomDataset(X_train, y_train, transform)
valset = CustomDataset(X_val, y_val, transform)
testset = CustomDataset(X_test, y_test, transform)

In [15]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [16]:
n_classes = 2

In [17]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [18]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 bert_size = 768,
                 hidden_size = 256,
                 n_layers = 2,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        
        self.lstm = nn.LSTM(bert_size, hidden_size, num_layers=n_layers, batch_first= True,  bidirectional=True)
                 
        self.classifier = nn.Linear(hidden_size*2 , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        with torch.no_grad():
            embeded = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        
        if self.dr_rate:
            embeded = self.dropout(embeded[0])
        else:
            embeded = embeded[0]
            
        packed_input = pack_padded_sequence(embeded, valid_length.tolist(), batch_first=True, enforce_sorted=False)
        packed_output,(hidden, cell) = self.lstm(packed_input)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        logit = self.classifier(hidden)
        
        return logit

In [19]:
model = BERTClassifier(bertmodel).to(DEVICE)
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [20]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [21]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , l, s, y = batch
        x = x.to(DEVICE)
        l = l.to(DEVICE)
        s = s.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x, l, s)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [22]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , l, s, y = batch
            x = x.to(DEVICE)
            l = l.to(DEVICE)
            s = s.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x, l, s)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
best_val_loss = None
n_epochs = 100
for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    end_time = time.time()
    
    if (epoch+1) % 10 == 0:
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_accuracy:.2f}%')
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

Epoch: 10 | Epoch Time: 4m 21s
	Train Loss: 0.172 | Train Acc: 93.42%
	 Val. Loss: 0.179 |  Val. Acc: 93.28%
Epoch: 20 | Epoch Time: 4m 13s
	Train Loss: 0.152 | Train Acc: 94.19%
	 Val. Loss: 0.190 |  Val. Acc: 93.16%
Epoch: 30 | Epoch Time: 4m 13s
	Train Loss: 0.118 | Train Acc: 95.57%
	 Val. Loss: 0.226 |  Val. Acc: 92.85%
Epoch: 40 | Epoch Time: 4m 13s
	Train Loss: 0.091 | Train Acc: 96.57%
	 Val. Loss: 0.266 |  Val. Acc: 92.55%
Epoch: 50 | Epoch Time: 4m 14s
	Train Loss: 0.073 | Train Acc: 97.32%
	 Val. Loss: 0.295 |  Val. Acc: 92.84%
Epoch: 60 | Epoch Time: 4m 13s
	Train Loss: 0.060 | Train Acc: 97.81%
	 Val. Loss: 0.335 |  Val. Acc: 92.42%
Epoch: 70 | Epoch Time: 4m 13s
	Train Loss: 0.053 | Train Acc: 98.07%
	 Val. Loss: 0.357 |  Val. Acc: 92.42%
Epoch: 80 | Epoch Time: 4m 13s
	Train Loss: 0.047 | Train Acc: 98.31%
	 Val. Loss: 0.367 |  Val. Acc: 92.64%
Epoch: 90 | Epoch Time: 4m 13s
	Train Loss: 0.042 | Train Acc: 98.45%
	 Val. Loss: 0.396 |  Val. Acc: 92.45%
Epoch: 100 | Epoch 

In [25]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [26]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(88.8724, device='cuda:0')
