##### - 예제 6.20 문장 분류 모델

In [1]:
from torch import nn

In [2]:
class SentenceClassifier(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type = "lstm"):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0)

        if model_type == 'rnn':
            self.model = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        elif model_type == 'lstm':
            self.model = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim*2, 1)
        else :
            self.classifier = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output,_ = self.model(embeddings)
        last_output = output[:,-1,:]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

##### - 예제 6.21 데이터세트 불러오기

In [3]:
import pandas as pd
from Korpora import Korpora

In [4]:
corpus = Korpora.load("nsmc")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\kdp\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\kdp\K

In [5]:
corpus_df = pd.DataFrame(corpus.test)

train = corpus_df.sample(frac=0.9, random_state=42)
test = corpus_df.drop(train.index)

print(train.head(5).to_markdown())
print("Training Data Size :", len(train))
print("Testing Data Size :", len(test))

|       | text                                                                                     |   label |
|------:|:-----------------------------------------------------------------------------------------|--------:|
| 33553 | 모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. |       1 |
|  9427 | 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...                                    |       0 |
|   199 | 신날 것 없는 애니.                                                                       |       0 |
| 12447 | 잔잔 격동                                                                                |       1 |
| 39489 | 오랜만에 찾은 주말의 명화의 보석                                                         |       1 |
Training Data Size : 45000
Testing Data Size : 5000


##### - 예제 6.22 데이터 토큰화 및 단어사전 구축

In [6]:
from konlpy.tag import Okt
from collections import Counter

In [7]:
def build_vocab(corpus, n_vocab, special_tokens):
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    vocab = special_tokens

    for token, count in counter.most_common(n_vocab):
        vocab.append(token)
        
    return vocab

In [8]:
tokenizer = Okt()
train_tokens = [tokenizer.morphs(review) for review in train.text]
test_tokens = [tokenizer.morphs(review) for review in test.text]

vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=['<pad>','<unk>'])
token_to_id = {token:idx for idx, token in enumerate(vocab)}
id_to_token = {idx:token for idx, token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


##### - 예제 6.23 정수 인코딩 및 패딩

In [9]:
import numpy as np

In [10]:
def pad_sequences(sequences, max_length, pad_value):
    result = list()
    for sequence in sequences:
        sequence = sequence[:max_length]
        pad_length = max_length - len(sequence)
        padded_sequence = sequence + [pad_value] * pad_length
        result.append(padded_sequence)
    return np.asarray(result)

In [11]:
unk_id = token_to_id["<unk>"]
train_ids = [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens]
test_ids = [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens]

max_length = 32
pad_id = token_to_id['<pad>']
train_ids = pad_sequences(train_ids, max_length, pad_id)
test_ids = pad_sequences(test_ids, max_length, pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


##### - 예제 6.24 데이터로더 적용

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset

In [13]:
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

##### - 예제 6.25 손실 함수와 최적화 함수 정의

In [14]:
from torch import optim

In [15]:
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

classifier = SentenceClassifier(n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

##### - 예제 6.26 모델 학습 및 테스트

In [16]:
def train(model, datasets, criterion, optimizer, device, interval):
    model.train
    losses = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step% interval == 0:
            print(f'Train Loss {step} : {np.mean(losses)}')

In [17]:
def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")    

In [18]:
epochs = 5
interval = 500
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6888815760612488
Train Loss 500 : 0.6931502986810878
Train Loss 1000 : 0.6901218803910227
Train Loss 1500 : 0.6781182200570333
Train Loss 2000 : 0.6644962208828648
Train Loss 2500 : 0.6522199264863452
Val Loss : 0.6108439846541553, Val Accuracy : 0.6884
Train Loss 0 : 0.8223201036453247
Train Loss 500 : 0.5862487866255099
Train Loss 1000 : 0.5823909821031572
Train Loss 1500 : 0.5671709331768818
Train Loss 2000 : 0.5516468487132614
Train Loss 2500 : 0.5425152145936841
Val Loss : 0.4797749638367004, Val Accuracy : 0.762
Train Loss 0 : 0.49028337001800537
Train Loss 500 : 0.43355464212551803
Train Loss 1000 : 0.4273923209288737
Train Loss 1500 : 0.41920397382510016
Train Loss 2000 : 0.415467390651884
Train Loss 2500 : 0.412370437243041
Val Loss : 0.41123676133422427, Val Accuracy : 0.8096
Train Loss 0 : 0.27190765738487244
Train Loss 500 : 0.3520782297986472
Train Loss 1000 : 0.35112087563290584
Train Loss 1500 : 0.3514702291160246
Train Loss 2000 : 0.35192403873835487
Tr

##### - 예제 6.27 학습된 모델로부터 임베딩 추출

In [19]:
token_to_embedding = dict()
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()

for word, emb in zip(vocab, embedding_matrix):
    token_to_embedding[word] = emb

token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [-1.2027123e+00 -2.8343022e-01  1.1898317e-01  6.6054815e-01
 -9.0084985e-02 -2.5723609e-01  6.0855931e-01 -7.0697689e-01
 -9.3626845e-01 -1.7160279e-01  9.8972571e-01 -7.4839994e-02
  5.8113891e-01  3.5011408e-01  5.5509436e-01 -1.0084178e+00
 -4.7861651e-02  4.5484770e-02 -1.1176127e+00 -1.1153077e+00
 -5.5946231e-01  1.6609976e-01 -6.9103360e-01 -4.2102665e-01
 -1.0999032e+00 -1.0133801e+00 -1.1765000e+00  1.2447888e+00
 -1.0823756e+00 -2.0020647e+00 -9.9515641e-01 -3.5422200e-01
  1.2760500e+00  4.9898720e-01  8.1064677e-01  4.4753474e-01
 -6.3663775e-01 -1.5238013e+00 -1.1860636e+00  1.5088710e+00
 -3.1907752e-01  6.8043464e-01 -7.0949972e-01  6.6307998e-01
  2.8670449e+00  2.0986512e+00 -7.9281801e-01 -2.3657794e-01
 -1.3676692e+00  6.4030433e-01  7.5306678e-01 -3.3508134e-01
 -1.3501488e+00  4.0606800e-01  4.1135150e-01  1.4637607e+00
 -3.7260085e-02  1.9022252e-01  7.9558986e-01 -5.4282194e-01
 -2.6914645e-03  1.5162492e-01  2.1979239e+00  1.5191389e+00
 -1.8863884e+00  9.

##### - 예제 6.4 영화 리뷰 데이터세트 전처리

In [22]:
corpus = Korpora.load("nsmc")
corpus = pd.DataFrame(corpus.test)

tokenizer = Okt()
tokens = [tokenizer.morphs(review) for review in corpus.text]
print(tokens[:3])


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\kdp\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\kdp\K

##### - 예제 6.13 Word2Vec 모델 학습

In [20]:
from gensim.models import Word2Vec

In [23]:
word2vec = Word2Vec(sentences=tokens, vector_size=128, window=5, min_count=1, sg=1, epochs=3,max_final_vocab=10000)

In [25]:
#  word2vec.save('../models/word2vec.model')

In [26]:
# word2vec = Word2Vec.load("../models/word2vec.model")

##### - 예제 6.28 사전 학습된 모델로 임베딩 계층 초기화

In [27]:
word2vec = Word2Vec.load("../models/word2vec.model")
init_embeddings = np.zeros((n_vocab, embedding_dim))

for index, token in id_to_token.items():
    if token not in ["<pad>", "<unk>"]:
        init_embeddings[index] = word2vec.wv[token]

embedding_layer = nn.Embedding.from_pretrained(torch.tensor(init_embeddings, dtype=torch.float32))

##### - 예제 6.29 사전 학습된 임베딩 계층 적용

In [28]:
class SentenceClassifier(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type = "lstm", pretrained_embedding = None):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0)

        if model_type == 'rnn':
            self.model = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        elif model_type == 'lstm':
            self.model = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim*2, 1)
        else :
            self.classifier = nn.Linear(hidden_dim, 1)
        
        if pretrained_embedding is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(pretrained_embedding, dtype=torch.float32))
        else :
            self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0)

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output,_ = self.model(embeddings)
        last_output = output[:,-1,:]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [29]:
classifier = SentenceClassifier(n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers, pretrained_embedding=init_embeddings).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

In [30]:
epochs = 5
interval = 500

for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6780356764793396
Train Loss 500 : 0.6215563155338911
Train Loss 1000 : 0.5661759556054354
Train Loss 1500 : 0.5479313805748985
Train Loss 2000 : 0.5336774771866472
Train Loss 2500 : 0.5264726846647567
Val Loss : 0.46545036694112296, Val Accuracy : 0.789
Train Loss 0 : 0.68622887134552
Train Loss 500 : 0.4692934120367625
Train Loss 1000 : 0.4692014511618819
Train Loss 1500 : 0.46794953476699647
Train Loss 2000 : 0.46592387885436126
Train Loss 2500 : 0.4646260380530443
Val Loss : 0.4610077575468027, Val Accuracy : 0.777
Train Loss 0 : 0.43489745259284973
Train Loss 500 : 0.46049108896069896
Train Loss 1000 : 0.4548237497632677
Train Loss 1500 : 0.45092937493228974
Train Loss 2000 : 0.4480627098124007
Train Loss 2500 : 0.44576264453715964
Val Loss : 0.4292341021779246, Val Accuracy : 0.8026
Train Loss 0 : 0.2730364501476288
Train Loss 500 : 0.442234830421483
Train Loss 1000 : 0.43690549838286896
Train Loss 1500 : 0.43268046691408163
Train Loss 2000 : 0.43083471895708914
T