In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from sklearn.model_selection import train_test_split

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from collections import Counter
from konlpy.tag import Mecab

import time

import numpy as np

In [2]:
import warnings 
warnings.simplefilter('ignore')

In [3]:
df_train = pd.read_csv("./naver_train.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./naver_test.csv", encoding="utf-8-sig")

In [4]:
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터 보고 초딩 영화 줄오버 연기조차 가볍지 않구나,1
2,10265843,너무 재밓었다 그래서 보는 것을 추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0
4,6483659,사이몬 페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어 보이기만 했던 커스...,1


In [5]:
def delete_null(data):
    data = data.dropna(axis=0).reset_index(drop=True)
    space_idx = []
    for i in range(len(data)):
        if str.isspace(data.iloc[i, 1]) == True:
            space_idx.append(i)
    data = data.drop(space_idx)
    
    return data

In [6]:
df_train, df_test = map(delete_null, [df_train, df_test])

In [7]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [8]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [9]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [10]:
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)
y_test = y_test.astype(np.int64)

In [11]:
class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
        
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
    
    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

In [12]:
m = Mecab("C:\mecab\mecab-ko-dic")

def tokenizer(text):
    return m.morphs(text)

def build_vocab(data, threshold):
    counter = Counter()

    for i in range(len(data)):
        tokens = tokenizer(data[i])
        counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<unk>')
    for w in words:
        vocab.add_word(w)
    return vocab

def tokenizing(data, max_length=256):
    data_size = len(data)
    length = []
    for i in range(data_size):
        data[i] = tokenizer(data[i])
        length.append(len(data[i]))
        if len(data[i]) > max_length:
            data[i] = data[i, :max_length]
        else:
            for _ in range(max_length-len(data[i])):
                data[i].append("<pad>")
    return data, length

In [13]:
vocab = build_vocab(X_train, 1)

In [14]:
print(list(vocab.word2idx.items())[:10])

[('<pad>', 0), ('<unk>', 1), ('진짜', 2), ('회', 3), ('때', 4), ('부터', 5), ('뀰잼', 6), ('으루', 7), ('보', 8), ('구', 9)]


In [15]:
X_train, X_val, X_test = map(tokenizing, [X_train, X_val, X_test])

In [16]:
print(X_train[0][0])

['진짜', '회', '때', '부터', '뀰잼', '으루', '보', '구', '있', '어', '욧', 'ㅋㅋ', '못', '보', '면', '다운', '받', '아서', '까지', '챙겨', '보', '는', '애청자', '입', '니', '당', 'ㅋㅋ', '빨리', '채연', '이랑', '이', '강준', '한', '민혁', '그', '엄마', '까지', '싹', '다', '태희', '군', '과', '사라', '양', '이', '통쾌', '한', '복수', '할', '수', '있', '도록', '잘', '만들', '어', '주', '세용', '♡', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

In [17]:
print(X_train[1][0])

58


In [18]:
def token2idx(data, vocab):
    data_size = len(data[0])
    sentence_length = len(data[0][0])
    for i in range(data_size):
        data[0][i] = [vocab(x) for x in data[0][i]]
    return data

In [19]:
X_train, X_val, X_test = map(lambda data : token2idx(data, vocab), [X_train, X_val, X_test])

In [20]:
print(X_train[0][0])

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 8, 15, 16, 17, 18, 19, 20, 8, 21, 22, 23, 24, 25, 13, 26, 27, 28, 29, 30, 31, 32, 33, 34, 19, 35, 36, 37, 38, 39, 40, 41, 29, 42, 31, 43, 44, 45, 10, 46, 47, 48, 11, 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:
print(X_train[1][0])

58


In [22]:
class CustomDataset(Dataset):
    def __init__(self, data, y,):
        self.x = data[0]
        self.length = data[1]
        self.y = y
        
    def __getitem__(self, index):
        return (torch.tensor(self.x[index]), self.length[index], self.y[index])
    
    def __len__(self):
        return len(self.x)

In [23]:
trainset = CustomDataset(X_train, y_train)
valset = CustomDataset(X_val, y_val)
testset = CustomDataset(X_test, y_test)

In [24]:
trainset[0]

(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  8, 15, 16, 17, 18,
         19, 20,  8, 21, 22, 23, 24, 25, 13, 26, 27, 28, 29, 30, 31, 32, 33, 34,
         19, 35, 36, 37, 38, 39, 40, 41, 29, 42, 31, 43, 44, 45, 10, 46, 47, 48,
         11, 49, 50, 51,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,

In [25]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [26]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [27]:
input_file = "glove.txt"
output_file = "tmp.txt"

glove2word2vec(input_file, output_file)

glove = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [28]:
vocab_size = len(vocab.word2idx.keys())
embedding_size = 100
embedding_weight = np.zeros((vocab_size, embedding_size))
for i in range(2, vocab_size):
    if vocab.idx2word[i] in glove.key_to_index.keys():
        embedding_weight[i] = glove[vocab.idx2word[i]]
embedding_weight = torch.tensor(embedding_weight)

In [29]:
print(embedding_weight)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1209,  0.5049,  0.2698,  ..., -0.1276,  1.5774, -0.5342],
        ...,
        [ 0.0136, -0.0377,  0.3660,  ..., -0.3099,  0.8662,  0.0130],
        [-0.2016, -0.3475, -0.2892,  ..., -0.6292,  0.3447,  0.6372],
        [-0.1395,  0.1505, -0.0912,  ..., -0.0351, -0.0776,  0.7689]],
       dtype=torch.float64)


In [30]:
class LSTM(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super(LSTM, self).__init__()
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first= True,  bidirectional=True)
        self.out = nn.Linear(hidden_dim*2, n_classes, bias=True)

    def forward(self, x, length):
        embeded = self.embed(x)
        packed_input = pack_padded_sequence(embeded, length.tolist(), batch_first=True, enforce_sorted=False)
        packed_output,(hidden, cell) = self.lstm(packed_input)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        logit = self.out(hidden)
        return logit

In [31]:
n_classes = 2
model = LSTM(3, 256, vocab_size, 100, n_classes).to(DEVICE)
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [32]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , l, y = batch
        x = x.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x, l)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy 

In [33]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , l, y = batch
            x = x.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x, l)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [34]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
model.embed.weight.data.copy_(embedding_weight)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1209,  0.5049,  0.2698,  ..., -0.1276,  1.5774, -0.5342],
        ...,
        [ 0.0136, -0.0377,  0.3660,  ..., -0.3099,  0.8662,  0.0130],
        [-0.2016, -0.3475, -0.2892,  ..., -0.6292,  0.3447,  0.6372],
        [-0.1395,  0.1505, -0.0912,  ..., -0.0351, -0.0776,  0.7689]],
       device='cuda:0')

In [36]:
best_val_loss = None
n_epochs = 15
for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_accuracy:.2f}%')
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

Epoch: 01 | Epoch Time: 0m 56s
	Train Loss: 0.432 | Train Acc: 79.82%
	 Val. Loss: 0.387 |  Val. Acc: 81.75%
Epoch: 02 | Epoch Time: 0m 55s
	Train Loss: 0.363 | Train Acc: 83.63%
	 Val. Loss: 0.355 |  Val. Acc: 84.32%
Epoch: 03 | Epoch Time: 0m 54s
	Train Loss: 0.331 | Train Acc: 85.45%
	 Val. Loss: 0.369 |  Val. Acc: 82.99%
Epoch: 04 | Epoch Time: 0m 54s
	Train Loss: 0.308 | Train Acc: 86.80%
	 Val. Loss: 0.327 |  Val. Acc: 85.63%
Epoch: 05 | Epoch Time: 0m 54s
	Train Loss: 0.288 | Train Acc: 87.75%
	 Val. Loss: 0.323 |  Val. Acc: 85.95%
Epoch: 06 | Epoch Time: 0m 55s
	Train Loss: 0.270 | Train Acc: 88.73%
	 Val. Loss: 0.317 |  Val. Acc: 86.33%
Epoch: 07 | Epoch Time: 0m 55s
	Train Loss: 0.253 | Train Acc: 89.64%
	 Val. Loss: 0.316 |  Val. Acc: 86.27%
Epoch: 08 | Epoch Time: 0m 56s
	Train Loss: 0.237 | Train Acc: 90.42%
	 Val. Loss: 0.333 |  Val. Acc: 86.41%
Epoch: 09 | Epoch Time: 0m 56s
	Train Loss: 0.222 | Train Acc: 91.14%
	 Val. Loss: 0.323 |  Val. Acc: 86.22%
Epoch: 10 | Epoch T

In [37]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [38]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(86.4488, device='cuda:0')
