In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import numpy as np

import gluonnlp as nlp

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import pandas as pd

In [2]:
df_train = pd.read_csv("./train_for_korean.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./test_for_korean.csv", encoding="utf-8-sig")

In [3]:
df_train = df_train.dropna(axis=0)
space_idx = []
for i in range(len(df_train)):
    if str.isspace(df_train.iloc[i, 1]) == True:
        space_idx.append(i)
df_train = df_train.drop(df_train.index[[space_idx]])

  result = getitem(key)


In [4]:
df_test = df_test.dropna(axis=0)
space_idx = []
for i in range(len(df_test)):
    if str.isspace(df_test.iloc[i, 1]) == True:
        space_idx.append(i)
df_test = df_test.drop(df_test.index[[space_idx]])

  result = getitem(key)


In [5]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [6]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [7]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [8]:
y_train = y_train.astype(np.long)
y_val = y_val.astype(np.long)
y_test = y_test.astype(np.long)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
bertmodel, vocab = get_pytorch_kobert_model()
bertmodel = bertmodel.from_pretrained("kobert_weight")

using cached model
using cached model


In [10]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [11]:
max_len = 64
transform = nlp.data.BERTSentenceTransform(
            tok, max_seq_length=max_len, pad=True, pair=False)

In [12]:
class CustomDataset(Dataset):
    def __init__(self, x, y, transform):
        self.idx = torch.tensor([transform([sentence])[0] for sentence in x]).reshape(len(x), -1)
        self.l = torch.tensor([transform([sentence])[1].item() for sentence in x])
        self.s = torch.tensor([transform([sentence])[2] for sentence in x]).reshape(len(x), -1)
        self.y = torch.tensor(y)
        
    def __getitem__(self, index):
        return (self.idx[index], self.l[index], self.s[index], self.y[index])

    def __len__(self):
        return len(self.idx)

In [13]:
trainset = CustomDataset(X_train, y_train, transform)
valset = CustomDataset(X_val, y_val, transform)
testset = CustomDataset(X_test, y_test, transform)

In [14]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [15]:
n_classes = 2

In [16]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [17]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 bert_size = 768,
                 hidden_size = 256,
                 n_layers = 2,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        
        self.lstm = nn.LSTM(bert_size, hidden_size, num_layers=n_layers, batch_first= True,  bidirectional=True)
                 
        self.classifier = nn.Linear(hidden_size*2 , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        with torch.no_grad():
            embeded = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        
        if self.dr_rate:
            embeded = self.dropout(embeded[0])
        else:
            embeded = embeded[0]
            
        packed_input = pack_padded_sequence(embeded, valid_length.tolist(), batch_first=True, enforce_sorted=False)
        packed_output,(hidden, cell) = self.lstm(packed_input)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        logit = self.classifier(hidden)
        
        return logit

In [18]:
model = BERTClassifier(bertmodel).to(DEVICE)
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [19]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [20]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , l, s, y = batch
        x = x.to(DEVICE)
        l = l.to(DEVICE)
        s = s.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x, l, s)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [21]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , l, s, y = batch
            x = x.to(DEVICE)
            l = l.to(DEVICE)
            s = s.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x, l, s)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [22]:
best_val_loss = None
n_epochs = 100
for epoch in range(n_epochs+1):
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    print("[Epoch: %d] val loss : %5.2f | val acuuracy : %5.2f" % (epoch, val_loss, val_accuracy))
    print("[Epoch: %d] train loss : %5.2f | train acuuracy : %5.2f" % (epoch, train_loss, train_accuracy))
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

[Epoch: 0] val loss :  0.17 | val acuuracy : 93.79
[Epoch: 0] train loss :  0.19 | train acuuracy : 92.66
[Epoch: 1] val loss :  0.17 | val acuuracy : 93.83
[Epoch: 1] train loss :  0.18 | train acuuracy : 92.91
[Epoch: 2] val loss :  0.16 | val acuuracy : 93.81
[Epoch: 2] train loss :  0.18 | train acuuracy : 92.98
[Epoch: 3] val loss :  0.16 | val acuuracy : 93.86
[Epoch: 3] train loss :  0.18 | train acuuracy : 93.07
[Epoch: 4] val loss :  0.16 | val acuuracy : 93.81
[Epoch: 4] train loss :  0.18 | train acuuracy : 93.11
[Epoch: 5] val loss :  0.17 | val acuuracy : 93.61
[Epoch: 5] train loss :  0.18 | train acuuracy : 93.13
[Epoch: 6] val loss :  0.17 | val acuuracy : 93.67
[Epoch: 6] train loss :  0.18 | train acuuracy : 93.20
[Epoch: 7] val loss :  0.17 | val acuuracy : 93.41
[Epoch: 7] train loss :  0.18 | train acuuracy : 93.20
[Epoch: 8] val loss :  0.16 | val acuuracy : 93.91
[Epoch: 8] train loss :  0.17 | train acuuracy : 93.36
[Epoch: 9] val loss :  0.17 | val acuuracy : 9

[Epoch: 77] val loss :  0.33 | val acuuracy : 92.57
[Epoch: 77] train loss :  0.05 | train acuuracy : 98.25
[Epoch: 78] val loss :  0.34 | val acuuracy : 92.88
[Epoch: 78] train loss :  0.05 | train acuuracy : 98.25
[Epoch: 79] val loss :  0.36 | val acuuracy : 92.65
[Epoch: 79] train loss :  0.05 | train acuuracy : 98.25
[Epoch: 80] val loss :  0.36 | val acuuracy : 92.70
[Epoch: 80] train loss :  0.05 | train acuuracy : 98.33
[Epoch: 81] val loss :  0.35 | val acuuracy : 92.64
[Epoch: 81] train loss :  0.05 | train acuuracy : 98.32
[Epoch: 82] val loss :  0.35 | val acuuracy : 92.54
[Epoch: 82] train loss :  0.05 | train acuuracy : 98.32
[Epoch: 83] val loss :  0.34 | val acuuracy : 92.38
[Epoch: 83] train loss :  0.05 | train acuuracy : 98.34
[Epoch: 84] val loss :  0.34 | val acuuracy : 92.79
[Epoch: 84] train loss :  0.04 | train acuuracy : 98.35
[Epoch: 85] val loss :  0.36 | val acuuracy : 92.74
[Epoch: 85] train loss :  0.04 | train acuuracy : 98.45
[Epoch: 86] val loss :  0.35

In [23]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [24]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(88.9446, device='cuda:0')
