### 문장 분류 모델
- p.324 - p.330


In [1]:
from torch import nn

In [2]:
class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent layer, per direction.
        embedding_dim: Width of each token embedding.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, passed to the recurrent layer and
            applied again just before the classifier.
        bidirectinal: Whether the recurrent layer is bidirectional. The
            misspelled name is kept so existing keyword callers still work.
        model_type: ``"rnn"`` or ``"lstm"``.

    Raises:
        ValueError: If ``model_type`` is neither ``"rnn"`` nor ``"lstm"``.
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0,
        bidirectinal=True,
        model_type="lstm",
    ):
        super().__init__()

        # Index 0 is treated as the padding token.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )

        recurrent_layers = {"rnn": nn.RNN, "lstm": nn.LSTM}
        if model_type not in recurrent_layers:
            # Fail here rather than with an AttributeError inside forward().
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )

        # Assignment (the original used `==` for the RNN branch, which only
        # compared and never registered the layer).
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectinal,
            dropout=dropout,
            batch_first=True,
        )

        # A bidirectional encoder concatenates both directions, doubling the
        # feature width seen by the classifier.
        if bidirectinal:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
        else:
            self.classifier = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one logit per input sequence.

        Args:
            inputs: Integer token ids; the recurrent layer is built with
                ``batch_first=True``, so the batch dimension comes first.

        Returns:
            Tensor of raw (pre-sigmoid) logits with a trailing dimension of 1.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)

        # Keep only the final time step so the classifier receives one
        # feature vector per sequence.
        # NOTE(review): for padded inputs the final step is a padding
        # position - confirm this is intended.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)

        logits = self.classifier(last_output)

        return logits

In [3]:
# 데이터셋 불러오기-------------------------------------------------------------------
import pandas as pd
from Korpora import Korpora

In [4]:
# Load the NSMC corpus and build the working DataFrame from its test split.
corpus = Korpora.load("nsmc")
corpusDF = pd.DataFrame(corpus.test)

# 90% of the rows are sampled (fixed seed) for training; the rows that were
# not sampled become the evaluation set.
train = corpusDF.sample(frac=0.9, random_state=42)
test = corpusDF.drop(index=train.index)

print(train.head().to_markdown())
print(f"Training Data Size: {len(train)}")
print(f"Testing Data Size: {len(test)}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-43\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [5]:
# 데이터 토큰화 및 단어 사전 구축
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenised sentences.

    Args:
        corpus: Iterable of token sequences (one sequence per sentence).
        n_vocab: Maximum number of corpus tokens to keep, most frequent first.
        special_tokens: Tokens placed at the front of the vocabulary, in the
            given order. This argument is not modified.

    Returns:
        A new list: the special tokens followed by up to ``n_vocab`` of the
        most frequent tokens in ``corpus``.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy first: extending `special_tokens` directly would mutate the
    # caller's list.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))

    return vocab

# Split every review into morphemes with Okt.
tokenizer = Okt()
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# The vocabulary is built from the training tokens only.
vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])

# Lookup tables in both directions (id -> token and token -> id).
it_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in it_to_token.items()}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [6]:
# 정수 인코딩 & 패딩
import numpy as np

def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to ``max_length`` items.

    Args:
        sequences: Iterable of lists.
        max_length: Target length of every output row.
        pad_value: Value appended to sequences shorter than ``max_length``.

    Returns:
        ``np.asarray`` of the resulting equal-length lists.
    """

    def _fit(sequence):
        # Clip first, then top up with padding if the clip came up short.
        clipped = sequence[:max_length]
        return clipped + [pad_value] * (max_length - len(clipped))

    return np.asarray([_fit(sequence) for sequence in sequences])

unk_id = token_to_id["<unk>"]
pad_id = token_to_id["<pad>"]
max_length = 32

# Map each token to its id (unknown tokens fall back to <unk>), then pad or
# truncate every review to max_length.
train_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens],
    max_length,
    pad_id,
)
test_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens],
    max_length,
    pad_id,
)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [7]:
# 데이터로더 적용
import torch 
from torch.utils.data import TensorDataset, DataLoader

# Training split: ids and float labels -> dataset -> shuffled loader.
train_ids = torch.tensor(train_ids)
train_labels = torch.tensor(train["label"].values, dtype=torch.float32)
train_DS = TensorDataset(train_ids, train_labels)

# Evaluation split: same pipeline, iterated in fixed order.
test_ids = torch.tensor(test_ids)
test_labels = torch.tensor(test["label"].values, dtype=torch.float32)
test_DS = TensorDataset(test_ids, test_labels)

train_loader = DataLoader(dataset=train_DS, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_DS, batch_size=16, shuffle=False)

In [8]:
# 손실함수 & 최적화 함수 정의
from torch import optim

# Model hyper-parameters.
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

classifier = SentenceClassifier(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
)
classifier = classifier.to(device)

# Binary cross-entropy on raw logits, optimised with RMSprop.
criterion = nn.BCEWithLogitsLoss().to(device)
optimzier = optim.RMSprop(classifier.parameters(), lr=0.001)

In [9]:
# 모델 학습 및 테스트
def train(model, datasets, criterion, optimzier, device, interval):
    """Run one optimisation pass over ``datasets``.

    Args:
        model: Module called on each batch of input ids.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimzier: Optimiser stepped once per batch.
        device: Device the batch tensors are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    loss_tot = []

    for step, batch in enumerate(datasets):
        token_ids, targets = batch
        token_ids = token_ids.to(device)
        # Add a trailing dimension to the labels before computing the loss.
        targets = targets.to(device).unsqueeze(1)

        batch_loss = criterion(model(token_ids), targets)
        loss_tot.append(batch_loss.item())

        optimzier.zero_grad()
        batch_loss.backward()
        optimzier.step()

        if step % interval != 0:
            continue
        print(f'Train loss {step} : {np.mean(loss_tot)}')

def test(model, datasets, criterion, device):
	model.eval()
	loss_tot = []
	score = []

	for step, (input_ids, labels) in enumerate(datasets):
		input_ids = input_ids.to(device)
		labels = labels.to(device).unsqueeze(1)

		logits = model(input_ids)
		loss = criterion(logits, labels)

		loss_tot.append(loss.item())

		# 로지스틱
		pre_y = torch.sigmoid(logits) > 0.5

		score.extend(
			torch.eq(pre_y, labels).cpu().tolist()
		)
    
	print(f'Val loss: {np.mean(loss_tot)}, Val Score: {np.mean(score)}')



In [10]:
epochs = 5
interval = 500

for epoch in range(epochs):
    # Print the epoch number itself; `{epoch}` outside an f-string builds a
    # one-element set and prints as "{0}", "{1}", ...
    print("=" * 30, epoch, "=" * 30)
    train(classifier, train_loader, criterion, optimzier, device, interval)
    test(classifier, test_loader, criterion, device)
    print("=" * 70)
    


Train loss 0 : 0.6931707859039307
Train loss 500 : 0.6930916133279096
Train loss 1000 : 0.6784207920928101
Train loss 1500 : 0.6674440251756397
Train loss 2000 : 0.657576319844886
Train loss 2500 : 0.6419902373103799
Val loss: 0.5271316828628698, Val Score: 0.7374
Train loss 0 : 0.4230509102344513
Train loss 500 : 0.4979621397996853
Train loss 1000 : 0.48467893042526283
Train loss 1500 : 0.4711747458781662
Train loss 2000 : 0.46252894317400806
Train loss 2500 : 0.4562458096772897
Val loss: 0.41602776827998816, Val Score: 0.8028
Train loss 0 : 0.19948750734329224
Train loss 500 : 0.37600717329217526
Train loss 1000 : 0.37281208869430804
Train loss 1500 : 0.3734809644445827
Train loss 2000 : 0.3712198728512133
Train loss 2500 : 0.37051304665709534
Val loss: 0.38876364851435913, Val Score: 0.8154
Train loss 0 : 0.3124428987503052
Train loss 500 : 0.3067045787018454
Train loss 1000 : 0.3098963143696616
Train loss 1500 : 0.3110341376846509
Train loss 2000 : 0.3141649665440368
Train loss 250

In [11]:
# Extract the embeddings learned by the trained model.
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()

# Pair each vocabulary entry with its row of the embedding matrix.
token_to_embedding = dict(zip(vocab, embedding_matrix))

token = vocab[10]
print(token, token_to_embedding[token])

을 [-7.2241910e-02 -1.2390603e+00  6.8603176e-01  1.5073179e-01
 -2.6981872e-01  1.6092555e-01  1.2608292e+00  4.9933285e-01
  2.8274119e-01  6.3064897e-01  1.4297815e-01 -1.2178882e+00
  1.5678551e+00  2.0204496e+00  1.3050552e-01 -3.5524830e-01
 -5.7229286e-01  7.1214777e-01  1.8999766e+00  2.0279989e-01
 -9.7408062e-03  3.7015226e-01 -1.2747694e+00  2.1945402e-01
  1.0166867e-01  2.7313355e-01  2.3684933e+00  4.0055183e-01
 -3.9525253e-01  7.3077828e-01 -5.3721541e-01 -1.1170708e+00
  2.7867234e-01 -1.6353542e-01 -6.9091338e-01 -6.8649596e-01
 -2.6942199e-01  1.4521776e+00 -1.5798107e+00  3.9709952e-02
  1.3587626e+00 -1.1555054e+00 -2.8701308e-01 -1.5596823e+00
  2.4744058e-01  3.6817703e-01 -2.4767101e-01 -5.5280006e-01
 -1.0111893e+00  9.9024512e-02 -2.3046599e-01 -4.0864196e-01
 -1.5590028e-01  4.5918244e-01 -3.4504193e-01  5.3708833e-01
 -1.4072609e+00  1.6119199e+00 -8.6443865e-01 -1.0234309e-01
 -3.8908565e-01 -3.2798180e-01 -1.5098411e+00  1.3166368e+00
 -6.8289988e-02  4.757

### 학습된 word2vec 임베딩 값으로 모델 학습
- p.333 

In [14]:
# 사전 학습된 모델로 임베딩 계층 초기화
from gensim.models import Word2Vec
import numpy as np

word2vec = Word2Vec.load("../models/word2vec.model")

# Rows start as zeros; special tokens keep the zero vector.
init_embeddings = np.zeros((n_vocab, embedding_dim))

for index, token in it_to_token.items():
    # Skip special tokens, and also tokens the word2vec model does not
    # contain: indexing word2vec.wv with a missing key raises KeyError.
    if token not in ("<pad>", "<unk>") and token in word2vec.wv:
        init_embeddings[index] = word2vec.wv[token]

embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)
    

In [17]:
# 사전 학습된 임베딩 계층 적용
class SentenceClassifier_Pre(nn.Module):
    """Binary sentence classifier that can start from pretrained embeddings.

    Args:
        n_vocab: Number of rows in the embedding table; only used when
            ``pretrained_embedding`` is None.
        hidden_dim: Hidden size of the recurrent layer, per direction.
        embedding_dim: Width of each token embedding. It is also the input
            size of the recurrent layer, so it must match the width of
            ``pretrained_embedding`` when one is given.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, passed to the recurrent layer and
            applied again just before the classifier.
        bidirectinal: Whether the recurrent layer is bidirectional. The
            misspelled name is kept so existing keyword callers still work.
        model_type: ``"rnn"`` or ``"lstm"``.
        pretrained_embedding: Optional 2-D array of initial embedding
            weights, loaded through ``nn.Embedding.from_pretrained``.

    Raises:
        ValueError: If ``model_type`` is neither ``"rnn"`` nor ``"lstm"``.
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0,
        bidirectinal=True,
        model_type="lstm",
        pretrained_embedding=None,
    ):
        super().__init__()

        if pretrained_embedding is not None:
            # from_pretrained is called with its defaults, which keep the
            # loaded weights frozen.
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding, dtype=torch.float32)
            )
        else:
            # Index 0 is treated as the padding token.
            self.embedding = nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0,
            )

        recurrent_layers = {"rnn": nn.RNN, "lstm": nn.LSTM}
        if model_type not in recurrent_layers:
            # Fail here rather than with an AttributeError inside forward().
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )

        # Assignment (the original used `==` for the RNN branch, which only
        # compared and never registered the layer).
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectinal,
            dropout=dropout,
            batch_first=True,
        )

        # A bidirectional encoder concatenates both directions, doubling the
        # feature width seen by the classifier.
        if bidirectinal:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
        else:
            self.classifier = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one logit per input sequence.

        Args:
            inputs: Integer token ids; the recurrent layer is built with
                ``batch_first=True``, so the batch dimension comes first.

        Returns:
            Tensor of raw (pre-sigmoid) logits with a trailing dimension of 1.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)

        # Keep only the final time step so the classifier receives one
        # feature vector per sequence.
        # NOTE(review): for padded inputs the final step is a padding
        # position - confirm this is intended.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)

        logits = self.classifier(last_output)

        return logits

In [18]:
# Train the model that uses the pretrained embeddings.
classifier = SentenceClassifier_Pre(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim,
    n_layers=n_layers, pretrained_embedding=init_embeddings
).to(device)

criterion = nn.BCEWithLogitsLoss().to(device)
optimzier = optim.RMSprop(classifier.parameters(), lr=0.001)

epochs = 5
interval = 500

for epoch in range(epochs):
    # Print the epoch number itself; `{epoch}` outside an f-string builds a
    # one-element set and prints as "{0}", "{1}", ...
    print("=" * 30, epoch, "=" * 30)
    train(classifier, train_loader, criterion, optimzier, device, interval)
    test(classifier, test_loader, criterion, device)
    print("=" * 70)

Train loss 0 : 0.7004722356796265
Train loss 500 : 0.6583871606462254
Train loss 1000 : 0.6431397670037025
Train loss 1500 : 0.6287249683896992
Train loss 2000 : 0.6063951867660959
Train loss 2500 : 0.5904365351668647
Val loss: 0.49641471130017656, Val Score: 0.7616
Train loss 0 : 0.4719693064689636
Train loss 500 : 0.5040486927874788
Train loss 1000 : 0.49529849545105353
Train loss 1500 : 0.4905013521856502
Train loss 2000 : 0.486651747689016
Train loss 2500 : 0.48180876585875737
Val loss: 0.4493689206651986, Val Score: 0.786
Train loss 0 : 0.6565530300140381
Train loss 500 : 0.4527547110816438
Train loss 1000 : 0.45372317415791436
Train loss 1500 : 0.4497395373538365
Train loss 2000 : 0.44714319962135024
Train loss 2500 : 0.44595810623823856
Val loss: 0.4310065093227088, Val Score: 0.7996
Train loss 0 : 0.40444833040237427
Train loss 500 : 0.4174927618956851
Train loss 1000 : 0.4250700102000684
Train loss 1500 : 0.42984033189341675
Train loss 2000 : 0.430877105336169
Train loss 2500 