# 1. LSTM 모델을 이용한 NLP Classification(스팸 메일 분류기)

### LSTM 모델 사용하여 스팸 메일 분류 과정을 LSTM설계, 데이터 전처리 과정

#### 1.1 Fully Connceted Layer 복습
#### - RNN과 LSTM 모델을 학습하기에 앞서 기본적인 ANN(Fully Connceted Layer)를 Python로 구성하는 것을 복습

In [2]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

In [3]:
class ANN(nn.Module):
    def __init__(self, num_output, input_size, hidden_size, device):
        super(ANN, self).__init__()
        self.device = device
        
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size) 
        self.outlayer = nn.Linear(hidden_size, num_ouput)
        
    def forward(self, x):
        h = self.fc1(x).relu()
        h = self.fc2(h).relu()
        predict = self.outlayer(h)
        return predict


### 1.2 LSTM for NLP
#### - 가장 보편적으로 쓰이는 recurrent neural network 구조인 LSTM을 PyTorch로 꾸미는 과정. 기본적으로 텍스트를 다룰때에는 word2vec을 사용해도 되지만, nn.Embedding 레이어를 사용해 정수 인코딩 결과를 word2vec으로 만들어주는 레이어 사용

In [7]:
class LSTM_net(nn.Module):
    def __init__(self, num_output, size_vocab, dim_embed, hidden_size, linear_size, num_layers, device):
        super(LSTM_net, self).__init__()
        self.device = device # GPU
        self.num_output = num_output # 1
        self.hidden_size = hidden_size # 128
        self.num_layers = num_layers # 2
        
        self.embed = nn.Embedding(size_vocab, dim_embed)
        
        self.lstm = nn.LSTM(input_size = dim_embed, hidden_size = hidden_size,
                           num_layers = num_layers, dropout = 0.3, bidirectional = True)
        self.fclayer = nn.Linear(hidden_size, linear_size)
        self.outlayer = nn.Linear(linear_size, num_output)
        
    def forward(self, x):   # x = 정수 인코딩 결과 [batch, seq_len]
        scaler = 2 if self.lstm.bidirectinal == True else 1
        
        emb = self.embed(x) # word2vec 결과 [batch, seq_len, dim_embed]
        
        h_state = Variable(torch.zeros(self.num_layers*scaler, emb.size(0),
                                      self.hidden_size, requlres_grad = True)).to(self.device)
        c_state = Variable(torch.zeros(self.num_layers*scaler, emb.size(0),
                                      self.hidden_size, requlres_grad = True)).to(self.device)
        lstm_out, (h,c) = self.lstm(emb.transpose(1,0), (h_state, c_state))
        # 실제로 lstm   # (h,c) = (hidden, cell)
        # emb : [seq_len, batch, dim_embed], 1번째 2번째 차원을 바꾸기 위해 transpose넣어줌
        # seq_len가 가장 앞으로 오게 만들어야 함
        
        h = h[-1] # important # 마지막 타임의 hidden만 가져오겠다 # h = [batch, hidden]
        h = self.fclayer(h).relu()
        predict = self.outlayer(h)  # predict = [batch, num_output]
        return predict

### 1.3 Spam Mail Classification : 데이터 전처리
####  스팸 메일을 분류할 수 있는 이진 분류기를 LSTM을 이용해 꾸며보는 예시. 

In [8]:
import os
import pandas as pd

In [None]:
data = pd.read_csv('')
display(data.info(), data.head())

- 토큰화

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

stop_words = set(stopwords.words('english'))
data = data.dropna().reset_index(drop=True)
token_text = []
for i in range(5728):
    token = word_tokenize(data.iloc[i,0])
    token_stop_text = []
    for w in token:
        if w not in stop_words:
            token_stop_text.append(w)
    token_text.append(token_stop_text)
    
print('After cleaning :', len(token_text))

- 정수 인코딩

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_text)
print(len(tokenizer.word_index))

In [None]:
text_encoded = tokenizer.texts_to_sequences(token_text)
print(text_encoded[0]) # 첫번째 메일의 정수 인코딩 결과

- 학습을 위한 Label : Spam인 경우 1, Normal Text인 경우 0

- Padding 및 데이터 자르기
-- 이메일은 보통 다수의 문장으로 이루어져 있기 때문에 정제 및 추출을 거치더라도 1개 샘플의 길이가 길 수 있습니다. 따라서 maxlen을 설정하여 maxlen 이하의 토큰을 가진 이메일은 padding을, maxlen 이상의 토큰을 가진 이메일은 첫 100개만 사용하고 나머지는 버림

In [None]:
print(np.shape(text_encoded))
print(np.shape(text_label))
maxlen = 0
for w in text_encoded:
    if len(w) >= maxlen:
        maxlen = len(w)
print(maxlen)

maxlen = 100
rowdata = []
for w in text_encoded:
    if len(w) >= maxlen:
        rowdata.append(x[:maxlen])
    else:
        rowdata.append(np.pad(w, (0,maxlen), 'constant', constant_values=0)[:maxlen])
text_padded = np.concatenate(rowdata, axis=0).reshape(-1, maxlen)
print(np.shape(text_padded))

### 1.4 학습을 위한 Dataset 만들기 및 학습 과정

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
import torch.utils.data import DataLoader, TensorDataset, random_split
from torch import LongTensor as LT
from torch import FloatTensor as FT

class Generate_Dataset(torch.utils.data.Dataset):
    def __init__(self, xdata, ydata, device):
        self.x_data = xdata
        self.y_data = ydata
        self.device = device
        
    def __len__(self):
        return len(self.x_data)
    def __getitem__(self, idx):
        x = LT(self.x_data[idx]).to(self.device)
        y = LT(self.x_data[idx]).to(self.device)   
        return x,y

- Generate Dataset

In [None]:
datasett = Generate_Dataset(text_padded[:5000,:].reshape([-1,1]), device)
trainset, testset = random_split(dataset, [4500,500])
train_loader = DataLoader(trainset, batch_size = 256, shuffle =True)
test_loader = DataLoader(testset, batch_size = 500, shuffle = False)

- Define Network and Optimizer

In [None]:
lstm_net = LSTM_net(num_output = 2, size_vocab = len(tokenizer, word_index), dim_embed = 64,
                   hidden_size = 64, linear_size = 64, num_layers = 1, device = device)
optimizer = torch.optim.Adam(lstm_net.parameters(), lr = 0.01)

- Training Session

In [None]:
from tqdm import tqdm
for epoch in range(10):
    print('Epoch', epoch)
    with tqdm(train_loader, unit = 'batch') as tepoch:
        for x,y in tepoch:
            predict = lstm_net(x) # x = email word --> predict = [batch, 2]
            loss = torch.nn.functional.cross_entropy(predict, y.ravel())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(loss)
#             print(loss)
#             tepoch.set_description(f"Epoch {epoch}")
#             tepoch.set_postrix(loss = loss.item())

# loss = (predict 자리에 원핫 인코딩 결과가 들어가야함)
# y = [batch, 1] 에서 1(차원)이 사라지고 batch 차원으로 만들어야 하기 때문에 ravel 사용

- Test the Performance

In [None]:
with tqdm(test_loader, unit='batch') as tepoch:
    for x, y in tepoch:
        predict = lstm_net(x).argmax(1).detach().numpy()
        answer = y.revel().detach().numpy()
score = 0
for i in range(len(predict)):
    if predict[i] == answer[i]:
        score += 1
print(score, 'out of 500, accuracy is', score/500*100, '%')

# 500개 테스트 해서 처음 predict = [500, 2]


## NLP 문제 해결
### 1. Dataset 전처리
#### - token -> 정제/추출 -(이 과정에서 padding 진행)-> 정수인코딩
       padding :  기계번역시 사용 x, 분류문제시 사용 o
                  미니배치 사이즈 똑같이 맞추기 위해서 padding
### 2. Netword 구조

# 2. seq2seq 모델 이용한 NLP machine translation
### LSTM 모델 이용한 seq2seq 모델에서 기계 번역 구현 

### 2.1 Download Dataset

In [18]:
import os
import spacy
os.system("python -m spacy download en_core_web_sm")
os.system("python -m spacy download de_core_news_sm")

# Source from [1]
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')

In [36]:
def tokenize_de(text):
    return [tok.text for tok in spacy_german.tokenizer(text)][::-1]
def tokenize_en(text):
    return [tok.text for tok in spacy_english.tokenizer(text)]
SRC = Field(tokenize = tokenize_de, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

NameError: name 'Field' is not defined

In [38]:
from torchtext.datasets import Multi30k
from torchtext.legacy.data import Field

ModuleNotFoundError: No module named 'torchtext.legacy'

### 2.2 Network Structures

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

In [40]:
class seq_Encoder(nn.Module):
    def __init__(self, vocab_size, dim_embed, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size 
        self.num_layers = num_layers 
        
        self.embed = nn.Embedding(vocab_size, dim_embed)
        self.lstm = nn.LSTM(dim_embed, hidden_size, num_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        outputs, (hidden, cell) = self.lstm(self.dropout(self.embed(src)))
        return hidden, cell

In [41]:
class seq_Decoder(nn.Module):
    def __init__(self, output_size, dim_embed, hidden_size, num_layers, dropout):
        super().__init__()
        self.output_size = output_size 
        self.hidden_size = hidden_size 
        self.num_layers = num_layers 
        
        self.embed = nn.Embedding(output_size, dim_embed)
        self.lstm = nn.LSTM(dim_embed, hidden_size, num_layers, dropout = dropout)
        self.fclayer = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,input_data, hidden, cell):
        input_data = input_data.unsqueeze(0)
        embedded = self.dropout(self.embed(input_data))
        outputs, (hidden, cell) = self.lstm(embedded,(hidden, cell))
        prediction = self.fclayer(output.squeeze(0))
        return (prediction, hidden, cell)

In [42]:
import random

    class seq2seq(nn.Module):
        def __init__(self, encoder, decoder, device):
            super().__init__()
            self.encoder = encoder
            self.decoder = decoder
            self.device = device

        def forward(self, source, target, tf_ratio = 0.5):
            batch_size = target.shape[1]
            translation_length = target.shape[0]
            target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(translation_length, batch_size, target_vocab_size).to(self.device)
        hidden, cell = self.encoder(source)
        input_trans = target[0,:]
        
        for t in range(1, translation_length):
            output, hidden, cell = self.decoder(input_trans, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < tf_ratio
            input_trans = target[t] if teacher_force else output.argmax(1)
        return outputs

In [43]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
enc = seq_Encoder(len(SRC.vocab), 64, 64, 1, 0.3)
dec = seq_Decoder(len(TRG.vocab), 64, 64, 1, 0.3)

  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


NameError: name 'SRC' is not defined

In [None]:
seq_net = seq2seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(seq_net.parameters(), lr =0.01)

In [None]:
from torchtext.lagacy.data import Bucketlterator
train_iterator, valid_iterator, test_iterator = Bucketlterator.splits((train_data, vaild_data, test_data),
                                                                     batch_size=256, device = device)

### 2.3 Train the Translator Network

In [None]:
pad_index = TRG.vocab.stoi[TRG.pad_token] # padding이 일어나는 토큰
lossfcn = nn.CrossEntropyLoss(ignore_index = pad_index)

for epoch in range(10):
    loss_epoch = 0
    for batch in train_iterator:
        source_data = batch.src
        target_data = batch.trg
        target_pred = seq_net(source_data,target_data)
        target_pred = target_pred[1:].view(-1, target_pred.shape[-1])
        target_data = target_Data[:1].view(-1)
        optimizer.zero_grad()
        loss = lossfcn(target_pred, target_data)
        loss.backward()
        optimizer.step()
        loss_epoch += loss.item()
    print("Epoch", epoch, 'Loss', loss_epoch/len(train_iterator))