In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from sklearn.model_selection import train_test_split

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from konlpy.tag import Mecab

import time

import numpy as np

In [2]:
import warnings 
warnings.simplefilter('ignore')

In [3]:
df_train = pd.read_csv("./train_for_korean.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./test_for_korean.csv", encoding="utf-8-sig")

In [4]:
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터 보고 초딩 영화 줄 오버 연기조차 가볍지 않구나,1
2,10265843,너무 재밓었다 그래서 보는 것을 추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0
4,6483659,사이몬 페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어 보이기만 했던 커스...,1


In [5]:
df_train = df_train.dropna(axis=0).reset_index(drop=True)
space_idx = []
for i in range(len(df_train)):
    if str.isspace(df_train.iloc[i, 1]) == True:
        space_idx.append(i)
df_train = df_train.drop(space_idx)

In [6]:
df_test = df_test.dropna(axis=0).reset_index(drop=True)
space_idx = []
for i in range(len(df_test)):
    if str.isspace(df_test.iloc[i, 1]) == True:
        space_idx.append(i)
df_test = df_test.drop(space_idx)

In [7]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [8]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [9]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [10]:
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)
y_test = y_test.astype(np.int64)

In [11]:
m = Mecab("C:\mecab\mecab-ko-dic")

def tokenizer(text):
    return m.morphs(text)

In [12]:
word2idx = {}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

count = 2

for i in range(len(X_train)):
    X_train[i] = tokenizer(X_train[i])
    for token in X_train[i]:
        if token not in word2idx.keys():
            word2idx[token] = count
            count += 1
    
for i in range(len(X_val)):
    X_val[i] = tokenizer(X_val[i])

for i in range(len(X_test)):
    X_test[i] = tokenizer(X_test[i])

In [13]:
idx2word = {y:x for x,y in word2idx.items()}

In [14]:
def sent2idx(data, word2idx):
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] in word2idx.keys():
                data[i][j] = word2idx[data[i][j]]
            else:
                data[i][j] = word2idx["UNK"]
    return data

In [15]:
X_train = sent2idx(X_train, word2idx)
X_val = sent2idx(X_val, word2idx)
X_test = sent2idx(X_test, word2idx)

In [16]:
input_file = "glove.txt"
output_file = "tmp.txt"

glove2word2vec(input_file, output_file)

glove = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [17]:
vocab_size = len(word2idx.keys())
embedding_size = 100
weight = np.zeros((vocab_size, embedding_size))
for i in range(2, vocab_size):
    if idx2word[i] in glove.key_to_index.keys():
        weight[i] = glove[idx2word[i]]

In [18]:
weight = torch.tensor(weight)
print(weight.shape)

torch.Size([45706, 100])


In [19]:
def make_tensor(data, word2idx):
    max_length = 0
    length_list = []
    
    for i in data:
        length_list.append(len(i))
        if len(i) == 0:
            print("ERROR")
            raise Exception()
        if len(i) > max_length:
            max_length = len(i)
            
    for i in data:
        for _ in range(max_length-len(i)):
            i.append(word2idx["PAD"])
    
    data = torch.tensor(data.tolist())
    
    return torch.tensor(data), length_list

In [20]:
X_train_idx, X_train_length = make_tensor(X_train, word2idx)
X_val_idx, X_val_length = make_tensor(X_val, word2idx)
X_test_idx, X_test_length = make_tensor(X_test, word2idx)

In [21]:
y_train = torch.tensor(y_train)
y_val = torch.tensor(y_val)
y_test = torch.tensor(y_test)

In [22]:
y_train = y_train.unsqueeze(1)
y_val = y_val.unsqueeze(1)
y_test = y_test.unsqueeze(1)

In [23]:
class CustomDataset(Dataset):
    def __init__(self, x_tensor, x_length, y_tensor):
        self.x = x_tensor
        self.l = x_length
        self.y = y_tensor

    def __getitem__(self, index):
        return (self.x[index], self.l[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [24]:
trainset = CustomDataset(X_train_idx, X_train_length, y_train)
valset = CustomDataset(X_val_idx, X_val_length, y_val)
testset = CustomDataset(X_test_idx, X_test_length, y_test)

In [25]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [26]:
n_classes = 2

In [27]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [48]:
class CNN(nn.Module):
    def __init__(self, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(n_vocab, embed_dim)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = 32, 
                                kernel_size = (3, embed_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = 32, 
                                kernel_size = (4, embed_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = 32, 
                                kernel_size = (5, embed_dim))
        
        self.fc = nn.Linear(3 * 32, n_classes)
        
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.unsqueeze(1)
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        
        return self.fc(cat)

In [49]:
model = CNN(vocab_size, 100, n_classes).to(DEVICE)
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [50]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , l, y = batch
        x = x.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy 

In [51]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , l, y = batch
            x = x.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [52]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [53]:
model.embedding.weight.data.copy_(weight)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.5399,  0.2587, -0.0871,  ..., -0.2010, -0.5775, -0.1200],
        ...,
        [-0.1770,  0.7781,  0.1157,  ..., -0.5335,  0.1376,  0.0870],
        [ 0.2609,  0.3327,  0.3659,  ...,  0.4778,  0.1309, -0.2498],
        [-0.1374, -0.3602, -0.2114,  ...,  0.3010, -0.3807, -0.3200]],
       device='cuda:0')

In [54]:
best_val_loss = None
n_epochs = 15
for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_accuracy:.2f}%')
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

Epoch: 01 | Epoch Time: 12m 21s
	Train Loss: 0.483 | Train Acc: 76.95%
	 Val. Loss: 0.405 |  Val. Acc: 81.36%
Epoch: 02 | Epoch Time: 12m 12s
	Train Loss: 0.388 | Train Acc: 82.65%
	 Val. Loss: 0.374 |  Val. Acc: 83.34%
Epoch: 03 | Epoch Time: 12m 10s
	Train Loss: 0.358 | Train Acc: 84.30%
	 Val. Loss: 0.357 |  Val. Acc: 84.26%
Epoch: 04 | Epoch Time: 12m 10s
	Train Loss: 0.336 | Train Acc: 85.53%
	 Val. Loss: 0.346 |  Val. Acc: 85.06%
Epoch: 05 | Epoch Time: 12m 10s
	Train Loss: 0.318 | Train Acc: 86.53%
	 Val. Loss: 0.342 |  Val. Acc: 85.27%
Epoch: 06 | Epoch Time: 12m 10s
	Train Loss: 0.302 | Train Acc: 87.35%
	 Val. Loss: 0.335 |  Val. Acc: 85.83%
Epoch: 07 | Epoch Time: 12m 10s
	Train Loss: 0.287 | Train Acc: 88.15%
	 Val. Loss: 0.333 |  Val. Acc: 85.84%
Epoch: 08 | Epoch Time: 12m 10s
	Train Loss: 0.274 | Train Acc: 88.77%
	 Val. Loss: 0.330 |  Val. Acc: 86.14%
Epoch: 09 | Epoch Time: 12m 19s
	Train Loss: 0.261 | Train Acc: 89.39%
	 Val. Loss: 0.331 |  Val. Acc: 86.11%
Epoch: 10 

In [35]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [36]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(86.1040, device='cuda:0')
