- p324 ~ p335

In [2]:
from pathlib import Path

import pandas as pd
from Korpora import Korpora
from tabulate import tabulate

# 명령 프롬프트에서 가상환경 활성화 후 conda install tabulate 하기

- Data세트 불러오기

In [3]:
# Load the NSMC corpus through Korpora; only its `test` split is turned into a DataFrame here.
corpus = Korpora.load("nsmc")
corpus_df = pd.DataFrame(corpus.test)

# Sample 90% of the rows for training (fixed random_state for a reproducible split)
# and keep the remaining rows as the held-out set.
train = corpus_df.sample(frac=0.9, random_state = 42)
test = corpus_df.drop(train.index)

# NOTE(review): a later cell defines functions named `train` and `test`, which rebinds
# these names; cells that read these DataFrames must be executed before that cell.
print(train.head(5).to_markdown())
print("Training Data Size : ", len(train))
print("Testing Data Size : ", len(test))


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\hwans\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\hwa

- Data 토큰화 및 단어사전 구축

In [4]:
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenized sentences.

    Args:
        corpus: Iterable of token sequences (one sequence per sentence).
        n_vocab: Maximum number of corpus tokens to keep, most frequent first.
        special_tokens: Tokens placed at the front of the vocabulary.

    Returns:
        A new list holding the special tokens followed by up to ``n_vocab``
        of the most frequent corpus tokens.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy the special tokens so the caller's list is never mutated by the appends below.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Split every review into morphemes with the Okt tokenizer.
tokenizer = Okt()
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# The vocabulary is built from the training tokens only: two special tokens
# followed by the 5000 most frequent tokens.
vocab = build_vocab(train_tokens, 5000, ["<pad>", "<unk>"])

# Lookup tables in both directions (index <-> token).
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


- 정수 인코딩 및 패딩

In [5]:
import numpy as np

def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of lists of token ids.
        max_length: Target length of every output row.
        pad_value: Value appended to sequences shorter than ``max_length``.

    Returns:
        A NumPy array built from the padded/truncated sequences.
    """
    padded_rows = []
    for ids in sequences:
        clipped = ids[:max_length]
        filler = [pad_value] * (max_length - len(clipped))
        padded_rows.append(clipped + filler)
    return np.asarray(padded_rows)

# Ids of the special tokens and the fixed sequence length used for padding.
unk_id = token_to_id["<unk>"]
pad_id = token_to_id["<pad>"]
max_length = 32

# Integer-encode each review (tokens missing from the vocabulary map to <unk>),
# then truncate / pad every review to max_length.
train_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens],
    max_length,
    pad_id,
)
test_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens],
    max_length,
    pad_id,
)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


- DataLoader 적용

In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# torch.as_tensor converts the padded id arrays to tensors, and returns an existing
# tensor unchanged, so re-running this cell does not trigger the copy-construct
# UserWarning that torch.tensor(<tensor>) emits.
train_ids = torch.as_tensor(train_ids)
test_ids = torch.as_tensor(test_ids)

# Labels are stored as float32 tensors.
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

# Only the training loader shuffles; the evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

  train_ids = torch.tensor(train_ids)
  test_ids = torch.tensor(test_ids)


- 문장 분류 모델

In [64]:
from torch import nn

class SentenceClassifier(nn.Module):
    """Recurrent sentence classifier: embedding -> RNN/LSTM -> dropout -> linear.

    Args:
        n_vocab: Number of rows of the embedding table (ignored when
            ``pretrained_embedding`` is given).
        hidden_dim: Hidden size of the recurrent layer.
        embedding_dim: Size of each embedding vector / recurrent input size.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability used between recurrent layers and
            before the final linear layer.
        bidirectional: Whether the recurrent layer runs in both directions.
        model_type: ``"rnn"`` or ``"lstm"``.
        pretrained_embedding: Optional array-like of embedding weights; when
            given, the embedding layer is created with
            ``nn.Embedding.from_pretrained`` (frozen by default).

    Raises:
        ValueError: If ``model_type`` is neither ``"rnn"`` nor ``"lstm"``.
    """

    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type="lstm", pretrained_embedding=None):
        super().__init__()

        # Build the embedding layer exactly once.
        if pretrained_embedding is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding, dtype=torch.float32)
            )
        else:
            self.embedding = nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0,
            )

        recurrent_layers = {"rnn": nn.RNN, "lstm": nn.LSTM}
        if model_type not in recurrent_layers:
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout has no effect with a single layer and only
            # makes PyTorch emit a warning, so disable it in that case.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions, doubling the feature size.
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.classifier = nn.Linear(classifier_in, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per input sequence.

        Args:
            inputs: Batch of token-id sequences (batch-first).

        Returns:
            Tensor of logits with a trailing dimension of size 1.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Use the recurrent output at the last position of the sequence axis.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

- 손실함수와 최적화 함수 정의

In [29]:
from torch import optim

# Model hyperparameters.
n_vocab = len(token_to_id)
embedding_dim = 128
hidden_dim = 674  # NOTE(review): unusual hidden size — confirm this value is intended.
n_layers = 2

# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

classifier = SentenceClassifier(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
)
classifier = classifier.to(device)

# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

- 모델 학습 및 테스트

In [30]:
def train(model, datasets, criterion, optimizer, device, interval):
    """Run one training pass over ``datasets``.

    Args:
        model: Module called on each batch of input ids.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    running_losses = []

    for step, batch in enumerate(datasets):
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        # Add a trailing dimension so the labels line up with the (batch, 1) logits.
        labels = labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        loss = criterion(model(input_ids), labels)
        running_losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if step % interval != 0:
            continue
        print(f"Train Loss {step} : {np.mean(running_losses)}")

def test(model, datasets, criterion, device) :
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets) :
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")

# Number of full passes over the training loader, and how often (in steps)
# the running training loss is printed.
epochs = 5
interval = 500

# Each epoch: one training pass followed by an evaluation on the held-out loader.
for epoch in range(epochs) :
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6928295493125916
Train Loss 500 : 0.7142991265374982
Train Loss 1000 : 0.7087508423940523
Train Loss 1500 : 0.7067495575354625
Train Loss 2000 : 0.7054704243037059
Train Loss 2500 : 0.7049807316539097
Val Loss : 0.714192312555953, Val Accuracy : 0.4822
Train Loss 0 : 0.6821919679641724
Train Loss 500 : 0.6989275076670085
Train Loss 1000 : 0.7003028777453092
Train Loss 1500 : 0.699967139327947
Train Loss 2000 : 0.7024520668668904
Train Loss 2500 : 0.7033344378999499
Val Loss : 0.6891534621723163, Val Accuracy : 0.5456
Train Loss 0 : 0.6555588841438293
Train Loss 500 : 0.6887910971027649
Train Loss 1000 : 0.6814906330673131
Train Loss 1500 : 0.6764612208796215
Train Loss 2000 : 0.6674276314366764
Train Loss 2500 : 0.656788858948875
Val Loss : 0.5640580684613115, Val Accuracy : 0.7188
Train Loss 0 : 0.5275339484214783
Train Loss 500 : 0.5282700491879515
Train Loss 1000 : 0.5171164729735711
Train Loss 1500 : 0.5065175536312635
Train Loss 2000 : 0.4985077992967818
Train Los

- 학습된 모델로부터 임베딩 추출

In [58]:
# Copy the embedding weights out of the model as a NumPy matrix
# and pair each vocabulary token with its row.
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()
token_to_embedding = dict(zip(vocab, embedding_matrix))

# Show the vector of one example token.
token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [ 0.00732145 -0.3356088   0.08656503 -0.03306498  0.23212002 -0.29337263
 -0.09231782 -0.03942776 -0.16306782  0.07997902 -0.02505183 -0.00750695
  0.06794622 -0.24104926  0.15246518  0.3419131   0.07585084  0.14669964
 -0.12381825  0.24512506 -0.01421452  0.25874284 -0.18326129 -0.3010084
 -0.1685668   0.14352842 -0.12448869  0.282141    0.05883038 -0.27367863
 -0.07821074  0.18770982 -0.02380344  0.17213023  0.12576792  0.41089207
  0.31010926 -0.08500806  0.02139693  0.06602467 -0.09702556  0.27963486
 -0.04602867 -0.13352565  0.40749404  0.05558844  0.02021302  0.04655577
  0.09889083  0.10674627  0.061408   -0.05395599  0.17052408  0.00915676
  0.09663107 -0.06783738  0.24049157  0.16517091 -0.13236412 -0.06434153
  0.33112514  0.05277485  0.24375214  0.02121567  0.25539294 -0.00287659
  0.23966342  0.16198406 -0.20413427 -0.2421456  -0.00624845 -0.1837253
 -0.42977405 -0.21224083  0.18130228 -0.16075855  0.01933512 -0.03536209
 -0.04387845  0.40699086 -0.0201764  -0.10097886

- 사전 학습된 모델로 임베딩 계층 초기화

- p289 예제 6-4 + p297 예제 6-13

In [59]:
# 프롬프트에서 가상환경 선택하셔서 conda install gensim 해주셔야 해용 
from gensim.models import Word2Vec
from konlpy.tag import Okt

# Tokenize every review of the corpus DataFrame into morphemes.
tokenizer = Okt()
tokens = [tokenizer.morphs(review) for review in corpus_df.text]

# Skip-gram (sg=1) Word2Vec; vector_size matches the classifier's embedding size of 128.
word2vec = Word2Vec(
    sentences = tokens,
    vector_size = 128,
    window = 5,
    min_count = 1,
    sg = 1,
    epochs = 3,
    max_final_vocab = 10000
)

# Create the output directory first so save() does not fail on a fresh checkout.
Path("../models").mkdir(parents=True, exist_ok=True)
word2vec.save("../models/word2vec.model")

In [60]:
word2vec = Word2Vec.load("../models/word2vec.model")

# One row per vocabulary id; rows that receive no pretrained vector stay all-zero.
init_embeddings = np.zeros((n_vocab, embedding_dim))

for index, token in id_to_token.items():
    if token in ("<pad>", "<unk>"):
        continue
    # The Word2Vec vocabulary is capped (max_final_vocab), so a classifier token may be
    # missing from it; skip such tokens instead of raising KeyError.
    if token in word2vec.wv.key_to_index:
        init_embeddings[index] = word2vec.wv[token]

# from_pretrained builds an embedding layer from the matrix (frozen by default).
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)

- 사전 학습된 임베딩 계층 적용

In [61]:
# class SentenceClassifier(nn.Module) :
#     def __init__(
#             self,
#             pretrained_embedding = None
#     ) :
        
#         if pretrained_embedding is not None :
#             self.embedding = nn.Embedding.from_pretrained(
#                 torch.tensor(pretrained_embedding, dtype = torch.float32)
#             )
#         else:
#             self.embedding = nn.Embedding(
#                 num_embeddings=n_vocab,
#                 embedding_dim =embedding_dim,
#                 padding_idx = 0
#             )

- 사전 학습된 임베딩을 사용한 모델 학습

In [67]:
# Rebuild the classifier, this time initialising its embedding layer
# from the Word2Vec-derived matrix.
classifier = SentenceClassifier(
    n_vocab=n_vocab,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
    pretrained_embedding=init_embeddings,
)
classifier = classifier.to(device)

# Fresh loss and optimizer bound to the new model's parameters.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

interval = 500
epochs = 5

# Each epoch: one training pass followed by an evaluation on the held-out loader.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6923500895500183
Train Loss 500 : 0.7204446342890848
Train Loss 1000 : 0.7105973306950275
Train Loss 1500 : 0.7071287550980214
Train Loss 2000 : 0.7047542050920207
Train Loss 2500 : 0.7038773179578571
Val Loss : 0.6534613993602058, Val Accuracy : 0.635
Train Loss 0 : 0.8273802399635315
Train Loss 500 : 0.6696390861760595
Train Loss 1000 : 0.6692885815144538
Train Loss 1500 : 0.6530790386757479
Train Loss 2000 : 0.617955075501502
Train Loss 2500 : 0.5930466720672761
Val Loss : 0.45289477763084557, Val Accuracy : 0.784
Train Loss 0 : 0.6053240895271301
Train Loss 500 : 0.46431599191562856
Train Loss 1000 : 0.4564578697665945
Train Loss 1500 : 0.45644596096954687
Train Loss 2000 : 0.4544312585385396
Train Loss 2500 : 0.4513419313934959
Val Loss : 0.44205117239929237, Val Accuracy : 0.78
Train Loss 0 : 0.4299277067184448
Train Loss 500 : 0.42384274699492847
Train Loss 1000 : 0.4285786613360509
Train Loss 1500 : 0.4420131941007662
Train Loss 2000 : 0.4734265914429789
Train 