## 1. LSTM 모델을 이용한 NLP Classification (스팸메일분류기) 

### 1-1. Fully Connected Layer (FCL)

In [2]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

In [3]:
# ANN (FCL) pytorch로 구성
# nn structure: input -> fc1 -> fc2 -> output
class ANN(nn.Module):
    def __init__(self, num_output, input_size, hidden_size, device):
        super(ANN, self).__init__()
        self.device = device
        
        #linear 모델 사용
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.outlayer = nn.Linear(hidden_size, num_output)
        
    def forward(self, x):
        #relu() if input is <=0 => return 0
        h = self.fc1(x).relu()
        h = self.fc2(h).relu()
        predict = self.outlayer(h)
        return predict

### 1-2. LSTM for NLP

In [35]:
# structure: input (정수 인코딩된 결과) -> Embed Layer (word2vec역할) -> LSTM
class LSTM_net(nn.Module):
    def __init__(self, num_output, size_vocab, dim_embed, hidden_size, linear_size, num_layers, device):
        super(LSTM_net, self).__init__()
        self.device = device #GPU
        self.num_output = num_output #1 (스팸인지 아닌지 구분하는 목적이기 때문에)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # (단어갯수,임베딩차원)
        self.embed = nn.Embedding(size_vocab, dim_embed)
        
        #layer 양이 많아질 수록 복잡해짐, dropout: 정규화목적, bidirection/singledirection선택
        self.lstm = nn.LSTM(input_size = dim_embed, hidden_size = hidden_size,
                           num_layers = num_layers, dropout = 0.3, bidirectional = True)
        # 분류 문제에 사용할 것이기에 lstm 다음에 fcl 구현 이 후 output layer
        self.fclayer = nn.Linear(hidden_size, linear_size)
        self.outlayer = nn.Linear(linear_size, num_output)
    
    # x: 정수인코딩 (batch_size, seq_len)
    def forward(self, x):
        scaler = 2 if self.lstm.bidirectional == True else 1
        emb = self.embed(x) #word2vec 결과 (batch_size, seq_len, dim_embed)
        
        # hidden state 초기화 (num_layer *scaler, batch_size, hidden_size(LSTM의 hidden layer size))
        h_state = Variable(torch.zeros(self.num_layers * scaler, emb.size(0),
                                      self.hidden_size, requires_grad = True)).to(self.device)
        # cell state 초기화 
        c_state = Variable(torch.zeros(self.num_layers * scaler, emb.size(0),
                                      self.hidden_size, requires_grad = True)).to(self.device)
        
        #lstm 에서는 em: [seq_len, batch, dim_embed] 로 들어가야하므로 transpose 이용
        lstm_out, (h, c) = self.lstm(emb.transpose(1,0), (h_state, c_state))
        
        # 마지막 시간의 hidden layer만 사용할 것이기에 -1 index로 호출
        h = h[-1]
        h = self.fclayer(h).relu()
        #최종  predict 차원 [batch, num_output]
        predict = self.outlayer(h)
        
        return predict

### 1-3. 데이터 전처리

In [9]:
import os
import pandas as pd
data = pd.read_csv("spam_assassin.csv") #kaggle에 있는 open dataset 이용
display(data.info(), data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


None

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [11]:
# 토큰화
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
data = data.dropna().reset_index(drop=True) #missing variable 확인
token_text = []
for i in range(len(data)):
    token = word_tokenize(data.iloc[i,0])
    token_stop_text = []
    for w in token:
        if w not in stop_words:
            token_stop_text.append(w)
    token_text.append(token_stop_text)
print('After cleaning:', len(token_text))

After cleaning: 5796


In [13]:
#  정수 인코딩
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_text)
print(len(tokenizer.word_index))

165354


In [14]:
text_encoded = tokenizer.texts_to_sequences(token_text)
#첫번째 메일에대한 정수인코딩결과 예시로 확인
print(text_encoded[0])

[35, 181, 6, 59, 47, 44, 225, 69713, 12, 79, 3, 2, 181, 6, 59, 1, 68, 3, 78, 6, 179, 15, 3, 16, 8, 16, 14, 33, 13, 7, 160, 8, 48, 7, 25, 18, 69714, 2, 36, 6, 16, 1, 10, 47, 5, 225, 44, 12, 69715, 61, 8, 102, 7, 15, 3, 214, 14, 33, 13, 16, 94, 8, 90, 7, 36, 6, 16, 8, 91, 7, 10, 47, 5, 225, 44, 12, 69716, 28, 8, 60, 7, 15, 3, 119, 8, 220, 6, 119, 14, 508, 13, 7, 58, 8, 64, 7, 25, 18, 69717, 2, 582, 6, 55, 1, 10, 159, 5, 321, 44, 12, 69718, 28, 15, 3, 412, 8, 220, 6, 16, 14, 33, 13, 7, 119, 8, 193, 7, 25, 18, 69719, 10, 159, 5, 321, 44, 12, 69720, 28, 424, 3, 119, 3, 472, 220, 6, 16, 14, 33, 13, 514, 412, 15, 3, 8358, 8, 8358, 14, 18035, 13, 7, 119, 8, 193, 7, 25, 18, 69721, 2, 107, 6, 59, 1, 10, 159, 5, 321, 44, 12, 43445, 28, 15, 3, 69722, 8, 14, 69723, 13, 16372, 7, 8358, 25, 8, 237, 1048, 17, 67, 7, 18, 69724, 107, 6, 59, 10, 159, 5, 321, 44, 12, 69725, 28, 15, 3, 8, 13528, 6, 16, 7, 6377, 8, 11791, 7, 18, 69726, 107, 6, 59, 10, 159, 5, 321, 44, 12, 43446, 28, 63, 3, 159, 5, 321, 44, 

In [15]:
# 학습을 위한 Label: spam 인 경우 1 아니면 0
text_label = np.array(data.iloc[:,1])

In [17]:
# padding 및 데이터 자르기
print(np.shape(text_encoded))
print(np.shape(text_label))

(5796,)
(5796,)


In [21]:
maxlen = 0
for w in text_encoded:
    if len(w) >= maxlen:
        maxlen = len(w)
print(maxlen) 

15067


In [23]:
max_len = 100 #너무 길어서 임의로 100단어까지만 설정
rowdata = []
for w in text_encoded:
    if len(w) >= max_len:
        rowdata.append(w[:max_len])
    else:
        rowdata.append(np.pad(w, (0, max_len), 'constant', constant_values = 0)[:max_len])
text_padded = np.concatenate(rowdata, axis = 0).reshape(-1, max_len)
print(np.shape(text_padded))

(5796, 100)


> 이메일은 보통 다수의 문장으로 이루어져있기 때문에 샘플의 길이가 길 수 있음. </br>
따라서 maxlen을 설정하여 이 수 이하의 토큰을 가진 이메일은 pdding, 이상의 토큰을 가진 이메일은 나머지를 버림

### 1-4. 학습
5000개 training 500개 test

In [24]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import LongTensor as LT
from torch import FloatTensor as FT

In [32]:
class Generate_Dataset(torch.utils.data.Dataset):
    def __init__(self, xdata, ydata, device):
        self.x_data = xdata
        self.y_data = ydata
        self.device = device
        
    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        #정수 인코딩 해서 LongTensor를 사용함
        x = LT(self.x_data[idx]).to(self.device)
        y = LT(self.y_data[idx]).to(self.device)
        return x,y

In [38]:
# dataset 생성
# x (input) : text_padded [5796,100] seq_len
# y (label): text_labeled [5796, 1] (1또는 0으로 이루어짐)

dataset = Generate_Dataset(text_padded[:5000, :], text_label[:5000].reshape([-1, 1]), device)
trainset, testset = random_split(dataset, [4500, 500])
# 학습하기 쉽도록 dataloader 를 이용
train_loader = DataLoader(trainset, batch_size = 256, shuffle = True)

#test는 한번에 할꺼라서 batch_size=500으로 하고 셔플하지않음
test_loader = DataLoader(testset, batch_size = 500, shuffle = False) 

In [39]:
# Define Netword and Optimizer
# output: binary classification을 one hot encoding 인 [1,0], [0,1]로 가져오려하기 때문에
lstm_net = LSTM_net(num_output = 2, size_vocab = len(tokenizer.word_index), dim_embed = 64,
                   hidden_size = 64, linear_size = 64, num_layers = 1, device = device)
optimizer = torch.optim.Adam(lstm_net.parameters(), lr = 0.01)

In [41]:
# Training Session
from tqdm import tqdm
for epoch in range(10):
    print('Epoch', epoch)
    with tqdm(train_loader, unit = 'batch') as tepoch:
        for x, y in tepoch:
            predict = lstm_net(x)
            # ravel() 이용하여 행렬의 불필요한 차원을 제거함
            loss = torch.nn.functional.cross_entropy(predict, y.ravel())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(loss)

Epoch 0


100%|████████████████████████████████████████| 18/18 [00:06<00:00,  2.60batch/s]


tensor(0.0054, grad_fn=<NllLossBackward0>)
Epoch 1


100%|████████████████████████████████████████| 18/18 [00:06<00:00,  2.59batch/s]


tensor(4.9930e-05, grad_fn=<NllLossBackward0>)
Epoch 2


100%|████████████████████████████████████████| 18/18 [00:06<00:00,  2.58batch/s]


tensor(3.0165e-05, grad_fn=<NllLossBackward0>)
Epoch 3


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.54batch/s]


tensor(2.3614e-05, grad_fn=<NllLossBackward0>)
Epoch 4


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.52batch/s]


tensor(0.0043, grad_fn=<NllLossBackward0>)
Epoch 5


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.45batch/s]


tensor(9.1700e-06, grad_fn=<NllLossBackward0>)
Epoch 6


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.50batch/s]


tensor(1.2639e-05, grad_fn=<NllLossBackward0>)
Epoch 7


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.49batch/s]


tensor(1.0264e-05, grad_fn=<NllLossBackward0>)
Epoch 8


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.49batch/s]


tensor(0.0074, grad_fn=<NllLossBackward0>)
Epoch 9


100%|████████████████████████████████████████| 18/18 [00:07<00:00,  2.49batch/s]

tensor(0.0043, grad_fn=<NllLossBackward0>)





#### 1-5. Test the Performance

In [42]:
with tqdm(test_loader, unit = 'batch') as tepoch:
    for x, y in tepoch:
        # predict 차원 [500,2] -> 1차원에 관해 argmax 하면 큰 값 반환하여 [500,1] 반환
        predict = lstm_net(x).argmax(1).detach().numpy()
        answer = y.ravel().detach().numpy()
score = 0
for i in range(len(predict)):
    if predict[i] == answer[i]:
        score += 1
print(score, 'out of 500, accuracy is ', score/500*100, '%')

100%|██████████████████████████████████████████| 1/1 [00:00<00:00,  3.39batch/s]

477 out of 500, accuracy is  95.39999999999999 %



