<a href="https://colab.research.google.com/github/kant1724/naver-sentiment/blob/master/LSTM_Naver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [4]:
import pandas as pd

data = pd.read_csv('nsmc/ratings.txt', sep='\t')
train_data = pd.read_csv('nsmc/ratings_train.txt', sep='\t')
test_data = pd.read_csv('nsmc/ratings_test.txt', sep='\t')

train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [5]:
!pip install konlpy



In [6]:
from tqdm import tqdm
from konlpy.tag import Okt
okt = Okt()

train_data_val = train_data['document'].values.astype(str)
test_data_val = test_data['document'].values.astype(str)
train_x = []
test_x = []
for data in tqdm(train_data_val):
  train_x.append(okt.morphs(data))
for data in tqdm(test_data_val):
  test_x.append(okt.morphs(data))

100%|██████████| 150000/150000 [05:06<00:00, 488.70it/s]
100%|██████████| 50000/50000 [01:38<00:00, 505.28it/s]


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

max_features = 50000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_x)

In [9]:
from keras.preprocessing.sequence import pad_sequences
maxlen = 100

train_x_padded = pad_sequences(tokenizer.texts_to_sequences(train_x), maxlen=maxlen)
test_x_padded = pad_sequences(tokenizer.texts_to_sequences(test_x), maxlen=maxlen)

train_x_padded[1]

Using TensorFlow backend.


array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,   941,     9,   472,
          55,   635,     3,   217,    46,  1626,    29,  1018,  6323,
       26892], dtype=int32)

In [10]:
import torch
import torch.nn as nn
import numpy as np

train_input = torch.LongTensor(train_x_padded)
test_input = torch.LongTensor(test_x_padded)

print(train_input)

tensor([[    0,     0,     0,  ...,    24,  7157,   688],
        [    0,     0,     0,  ...,  1018,  6323, 26892],
        [    0,     0,     0,  ...,   221,    18,    19],
        ...,
        [    0,     0,     0,  ...,    15, 17620,    17],
        [    0,     0,     0,  ...,    11,     4, 11248],
        [    0,     0,     0,  ...,     2,  3050,     3]])


In [11]:
train_target = torch.FloatTensor(train_data['label'].values)
test_target = torch.FloatTensor(test_data['label'].values)

print(train_target)

tensor([0., 1., 0.,  ..., 0., 1., 0.])


In [0]:
num_classes = 1
batch_size = 512
learning_rate = 0.001
embedding_dim = 300
hidden_size = 128
num_layers = 1
vocab_size = max_features

In [0]:
class GlobalMaxPooling1D(nn.Module):
    def __init__(self):
        super(GlobalMaxPooling1D, self).__init__()

    def forward(self, inputs):
        z, _ = torch.max(inputs, 1)
        return z

    def __repr__(self):
        return self.__class__.__name__ + '()'

class Model(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size) 
        self.proj = nn.Linear(embed_size, hidden_size)             
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, bidirectional=False)        
        self.pooling = GlobalMaxPooling1D()
        self.linear = nn.Linear(hidden_size, hidden_size)        
        self.dense = nn.Linear(hidden_size, 1)

    def forward(self, x): 
        x = self.embed(x)        
        out = torch.relu(self.proj(x))                
        out, (h, c) = self.lstm(out)                
        out = torch.relu(self.linear(self.pooling(out)))                        
        out = self.dense(out)

        return out

    def predict(self, x):
        preds = []
        with torch.no_grad():
            out = self.forward(x)
            preds.append(out)
        return torch.cat(preds)

    def predict_proba(self, x):
        return torch.sigmoid(self.predict(x))

In [0]:
model = Model(embedding_dim, hidden_size, num_layers).cuda()

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
num_epoch = 5
for epoch in range(num_epoch):    
    train = torch.utils.data.TensorDataset(train_input.cuda(), train_target.cuda()) 
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False)    
    cnt = 0
    for x_batch, y_batch in tqdm(train_loader, disable=True):        
        outputs = model(x_batch)        
        loss = criterion(np.squeeze(outputs), y_batch)
        if cnt % 100 == 0:
            print('epoch:' + str(epoch) + ', loss:' + str(loss))
        model.zero_grad()
        loss.backward()
        optimizer.step()
        cnt += 1

epoch:0, loss:tensor(0.6931, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:0, loss:tensor(0.4387, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:0, loss:tensor(0.3968, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:1, loss:tensor(0.3798, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:1, loss:tensor(0.3256, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:1, loss:tensor(0.3241, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:2, loss:tensor(0.3089, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:2, loss:tensor(0.2663, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:2, loss:tensor(0.2768, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:3, loss:tensor(0.2528, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
epoch:3, loss:tensor(0.2281, device='cuda:0', grad_fn=<Binar

In [0]:
out = []
with torch.no_grad():
    test = torch.utils.data.TensorDataset(test_input.cuda())    
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    for i, (x_batch,) in enumerate(test_loader):        
        res = model.predict_proba(x_batch).cpu().numpy()
        for re in res:          
          out.append(re)

In [17]:
result = []
for o in out:
  if o > 0.9:
    result.append(1)
  else:
    result.append(0)

test_label = test_data['label'].values

tot_cnt = len(result)
cnt = 0
for i in range(len(result)):
  if result[i] == test_label[i]:
    cnt += 1

print(cnt / tot_cnt)


0.82918
