# torch `nn.Embedding`

In [1]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 사전학습된 임베딩을 사용하지 않는 경우

In [2]:
sentences = [          
    'nice great best amazing',  # 긍정 문장 예시
    'stop lies',                # 부정/비판 문장 예시
    'pitiful nerd',             # 부정 문장 예시
    'excellent work',           # 긍정 문장 예시
    'supreme quality',          # 긍정 문장 예시
    'bad',                      # 부정 문장 예시
    'highly respectable'        # 긍정 문장 예시
]                               # 분류 모델에 넣을 입력 문장 리스트(list[str])
labels = [1, 0, 0, 1, 1, 0, 1]  # 각 문장에 대한 이진 라벨(1=긍정, 0=부정)

In [3]:
# 토큰화
from nltk.tokenize import word_tokenize

tokenized_sentences = [word_tokenize(sent) for sent in sentences]    # 긱 믄장을 토큰리스트(list(list[str]))로 변환
tokenized_sentences

[['nice', 'great', 'best', 'amazing'],
 ['stop', 'lies'],
 ['pitiful', 'nerd'],
 ['excellent', 'work'],
 ['supreme', 'quality'],
 ['bad'],
 ['highly', 'respectable']]

In [4]:
# 단어 사전 생성 + 정수 인쾽
from collections import Counter

tokens = [token for sent in tokenized_sentences for token in sent]   # 문장리스트를 평탄화하여 전체 토큰 리스트 생성
word_counts = Counter(tokens)
print(word_counts)

word_to_index = {word:index +2 for index,word in enumerate(tokens)}
word_to_index['<PAD>']=0    # 패딩토큰(길이맞추기)
word_to_index['<UNK>']=1    # OOV토큰(처리불가 단어 대체)
word_to_index = dict(sorted(word_to_index.items(),key=lambda x:x[1]))   # 인덱스를 기준으로 정렬
print(word_to_index)    # 단어 -> 인덱스 사전

vocab_size = len(word_to_index) # 전체 어휘 수 (특수토큰 포함)
vocab_size

Counter({'nice': 1, 'great': 1, 'best': 1, 'amazing': 1, 'stop': 1, 'lies': 1, 'pitiful': 1, 'nerd': 1, 'excellent': 1, 'work': 1, 'supreme': 1, 'quality': 1, 'bad': 1, 'highly': 1, 'respectable': 1})
{'<PAD>': 0, '<UNK>': 1, 'nice': 2, 'great': 3, 'best': 4, 'amazing': 5, 'stop': 6, 'lies': 7, 'pitiful': 8, 'nerd': 9, 'excellent': 10, 'work': 11, 'supreme': 12, 'quality': 13, 'bad': 14, 'highly': 15, 'respectable': 16}


17

In [5]:
# 정수 인코딩 함수 : 토큰화된 문장 리스트를 단어 -> 인덱스 사전으로 정수 시퀀스 (list[list(int)])로 변환
def texts_to_sequence(sentences, word_to_index):
    sequences = []
    
    for sent in sentences:
        sequence = []
        
        for token in sent:
            if token in word_to_index:
                sequence.append(word_to_index[token])
            else :
                sequence.append(word_to_index['<UNK>'])
        
        sequences.append(sequence)
        
    return sequences

sequences = texts_to_sequence(tokenized_sentences, word_to_index)
sentences

['nice great best amazing',
 'stop lies',
 'pitiful nerd',
 'excellent work',
 'supreme quality',
 'bad',
 'highly respectable']

In [None]:
import numpy as np

# 서로 다른 길이의 정수 시퀀스를 0(<PAD>)으로 채우거나 잘라내  (문장수, maxlen) 형태로 맞춰주는 함수
def pad_sequnces(sentences, maxlen):
    padded_sequences = np.zeros((len(sentences),maxlen),dtype=int)  # (문장수 x maxlen)크기의 0 패딩 배열
    
    for index, seq in enumerate(sequences): # 각 문장 시퀀스 순회
        padded_sequences[index, :len(seq)] = seq[:maxlen]   # 앞에서부터 시퀀스 채운다. 길면 maxlen까지만 채워 자른다.
        
    return padded_sequences

padded_sequnces = pad_sequnces(sequences,maxlen=4)  # 모든 문장 길이 4로 패딩/자르기
padded_sequnces # (문장 수,5) 형태

array([[ 2,  3,  4,  5],
       [ 6,  7,  0,  0],
       [ 8,  9,  0,  0],
       [10, 11,  0,  0],
       [12, 13,  0,  0],
       [14,  0,  0,  0],
       [15, 16,  0,  0]])

In [7]:
padded_sequnces.shape

(7, 4)

In [None]:
# Pytorch 텍스트 분류 모델 : Embedding + RNN + Linear로 이진 분류(logit) 출력
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class SimpleNet(nn.Module):
    # 정수 시퀀스를 임베딩 -> RNN -> 선형층으로 처리해 이진 분류 logit(1개)ㄹㄹ 출력
    def __init__(self,vocab_size,embedding_dim,hidden_size):
        super().__init__()
        self.embedding=nn.Embedding(        # 단어 ID를 밀집 벡터로 변환하는 임베딩 층
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,    # 임베딩 차원
            padding_idx=0
        )
        self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)   # 입력(배치,길이.차원) 형태의 RNN
        self.out = nn.Linear(hidden_size,1)     # 맞막 은닉 상태를 1차원 logit으로 변환
        
    def forward(self,x):
        embedded = self.embedding(x)    # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        out, h_n = self.rnn(embedded)   # h_n: (num_layers*directions, batch, hidden_size)
        out = self.out(h_n.squeeze(0))  # (batch_size, hidden_size) -> (batch,1)
        return out  # 출력 : 시그모이드 전 logit(확률이 아님)
    
embedding_dim = 100
model = SimpleNet(vocab_size,embedding_dim,hidden_size=16)  # 어휘크기/임베딩차원/은닉크기로 모델 생성
model

SimpleNet(
  (embedding): Embedding(17, 100, padding_idx=0)
  (rnn): RNN(100, 16, batch_first=True)
  (out): Linear(in_features=16, out_features=1, bias=True)
)

In [10]:
%pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
from torchinfo import summary   # 모델 구조를 표 형태로 요약

summary(model)  # model의 레이어 구성 / 파라미터 수를 요약

Layer (type:depth-idx)                   Param #
SimpleNet                                --
├─Embedding: 1-1                         1,700
├─RNN: 1-2                               1,888
├─Linear: 1-3                            17
Total params: 3,605
Trainable params: 3,605
Non-trainable params: 0

In [13]:
# 임베딩 가중치 확인 : 학습 전/후 Embedding 테이블과 단어별 벡터 조회
import pandas as pd

# 학습 전 임벧ㅇ 벡터
wv = model.embedding.weight.data    # Embedding 층의 가중치 행렬(단어ID x 임베딩 차원) 추출
print(wv.shape) # (vocab_size, embedding_dim)

# 특정 단어 벡터
vocab = word_to_index.keys()    # 단어사전에서 단어만 뽑아온다.
pd.DataFrame(wv,index=vocab)    

torch.Size([17, 100])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
<PAD>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<UNK>,-1.735707,-0.338836,0.38756,1.822761,1.180889,0.029844,-1.454998,0.928572,-0.157014,-0.841594,...,2.005783,0.139943,-0.994724,-1.118804,1.153105,-0.068381,0.769769,-0.516143,1.574835,-0.292705
nice,0.475319,-0.317899,0.765528,-0.052871,-0.705285,-0.378955,-0.748233,0.525564,0.641379,-0.454036,...,-1.955102,-0.937681,-0.16907,0.885088,-1.246579,0.048523,-0.445946,0.404705,0.059955,-1.336465
great,0.499201,-0.715669,0.121338,-0.448419,-0.06634,1.636819,-0.290606,-0.834929,0.167324,-0.273061,...,-1.352499,1.635663,0.460028,-0.822666,-0.853711,-0.232818,1.406272,-0.409277,-3.135439,0.809925
best,0.019643,1.520321,1.153095,-2.111665,-0.581775,0.603003,0.829944,-0.297855,1.255378,0.046525,...,-0.804685,0.860246,-2.392415,0.597881,1.046835,-0.689517,0.501067,-0.220726,1.172062,-0.7657
amazing,-0.528966,0.430008,0.344425,-1.987139,-1.231478,-0.105404,0.144557,-0.80639,0.652106,1.586295,...,-0.255125,0.879363,-0.357278,-0.204549,0.989971,-1.906618,-0.715281,-0.508421,0.664267,1.338793
stop,0.313743,1.578746,1.369931,-2.099982,0.089595,0.402788,0.245792,-1.682907,-0.575671,0.194547,...,-1.122362,-1.825266,0.916183,-0.624366,1.730734,1.245013,-0.740861,-0.352895,0.413123,0.83581
lies,-0.717076,-0.107237,1.90908,-0.956966,0.256004,-0.802785,-0.102114,1.81822,-1.332305,-0.597572,...,0.479183,-0.414356,0.223169,-1.400546,1.207151,0.776599,-0.996237,0.711294,0.116161,0.337716
pitiful,0.83296,1.364188,1.54045,1.477358,-1.342322,0.31026,-0.095891,-0.227439,-0.187051,-1.081465,...,0.064858,1.773807,-1.998466,0.394613,0.13913,1.054239,-0.806519,1.227212,-1.568208,3.540898
nerd,1.308359,0.048889,1.379176,-0.112046,-0.794457,0.633167,0.12052,-0.288433,1.718499,-0.960887,...,-0.31448,-0.817938,1.107119,0.017147,0.990949,0.505777,-1.218458,1.140192,-0.387397,-0.013644


In [16]:
# Pytorch 학습 준비 : 텐서 변환 -> DataLoader 구성 -> 손실함수/옵티마이저 설정
X = torch.tensor(padded_sequnces, dtype=torch.long)         # 입력 시퀀스(정수 ID)를 LongTensor로 변환
y = torch.tensor(labels, dtype=torch.float).unsqueeze(1)    # 라벨을 float으로 변환 후 (N,) -> (N,1)로 차원 맞춤

dataset = TensorDataset(X,y)
dataLoader = DataLoader(dataset, batch_size=2, shuffle=True)

criterion=nn.BCEWithLogitsLoss()                       # 출력 logit과 정답(0/1)로 이진분류 손실 계산(시그모이드 포함)
optimizer=optim.Adam(model.parameters(),lr=0.005)

BCEWithLogitsLoss을 사용할 떄에는 모델 출력이 sigmoid를 거치지않은 logit이어야 한다

In [None]:
# 학습루프 : 미니배치 단위로 20 epoch 학습하여 평균 손실 출력
for epoch in range(20):
    epoch_loss=0    # 손실 누적
    
    for x_batch, y_batch in dataLoader:
        optimizer.zero_grad()
        output = model(x_batch)             # 순전파로 logit 계ㅏㄴ
        loss = criterion(output, y_batch)   # 예측LOGIT과 정답으로 손실 계산
        loss.backward()                     # 역전파로 기울기 계산
        optimizer.step()                    # 파라미터 업뎅트
        
        epoch_loss += loss.item()   # 배치손실을 float로 누적
        
    print(f"Epoch {epoch+1}: loss {epoch_loss / len(dataLoader)}")  # epoch별 편균 손실 계산

Epoch 1: loss 0.6482200622558594
Epoch 2: loss 0.5331729203462601
Epoch 3: loss 0.4244436025619507
Epoch 4: loss 0.3081277571618557
Epoch 5: loss 0.21190485171973705
Epoch 6: loss 0.1461825855076313
Epoch 7: loss 0.10756273940205574
Epoch 8: loss 0.08008461818099022
Epoch 9: loss 0.062396758235991
Epoch 10: loss 0.04782864544540644
Epoch 11: loss 0.03916932921856642
Epoch 12: loss 0.0340281967073679
Epoch 13: loss 0.02790939388796687
Epoch 14: loss 0.02527895336970687
Epoch 15: loss 0.02127641998231411
Epoch 16: loss 0.018977869069203734
Epoch 17: loss 0.017061745980754495
Epoch 18: loss 0.0156174311414361
Epoch 19: loss 0.014366717310622334
Epoch 20: loss 0.013972284272313118


In [None]:
# 평가 / 예측 : 학습된 모델로 확률 -> 0/1 예측값 생성 후 정답과 비교
model.eval()
with torch.no_grad():               # 기울기 계산 비활성화
    output = model(X)               # 전체 샘플에 대한 예측 logit 계산
    prob = torch.sigmoid(output)    # logit에 0~1 확률로 계산
    pred = (prob >= 0.5).int()      # 임계값 0.5 기준으로 이진 분류(0/1) 예측값 생성
    
print(labels)
print(pred.squeeze().detach().numpy())  # 예측라벨을 1차원 numpy 배열로 변환

[1, 0, 0, 1, 1, 0, 1]
[1 0 0 1 1 0 1]


## 사전학습된 임베딩을 사용하는 경우

In [22]:
from gensim.models import KeyedVectors

model_wv = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
model_wv.vectors.shape

(3000000, 300)

In [24]:
# 임베딩 매트릭스 초기화 : 사전학습 벡터로 Embedding 레이어를 채우기 위한 준비
print(len(word_to_index))

# (vocab_size, embedding_dim) 크기의 0 행렬 생성
embedding_matrix = np.zeros((len(word_to_index), model_wv.vectors.shape[1]))
print(embedding_matrix.shape)

17
(17, 300)


In [31]:
# 사전학습된 임베딩 매핑 : 내 단어사전을 GoogleNews 벡터로 채워 embedding_matrix 구성
# model_wv.key_to_index['bad']    # 'bad'의 내부 인덱스 확인(706)
# model_wv.vectors[240]

# 단어가 사전학습 모델에 있으면 임베딩 벡터(np.ndarray)를 반환, 없으면 None 반환
def get_word_embedding(word):
    if word in model_wv:        # 사전학습 단어가 존재하면
        return model_wv[word]   # 해당 단어 임베딩 벡터 반환
    else:
        return None
# get_word_embedding('bad')
for word, index in word_to_index.items():   # 내 단어사전(단어 -> 인덱스)를 순회
    if index >= 2:                          # 특수토큰 제외
        emb = get_word_embedding(word)      # 사전학습 임베딩ㅇ서 해당 단어 벡터 조회
        if emb is not None:                 # 벡터가 존재하면
            embedding_matrix[index] = emb   # 내 인덱스 위치에 사전학습 벡터를 복사해서 채운다.

In [32]:
pd.DataFrame(embedding_matrix, index=word_to_index.keys())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
<PAD>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<UNK>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nice,0.158203,0.105957,-0.189453,0.386719,0.083496,-0.267578,0.083496,0.113281,-0.104004,0.178711,...,-0.085449,0.189453,-0.146484,0.134766,-0.040771,0.032715,0.089355,-0.267578,0.008362,-0.213867
great,0.071777,0.208008,-0.028442,0.178711,0.132812,-0.099609,0.096191,-0.116699,-0.008545,0.148438,...,-0.011475,0.064453,-0.289062,-0.048096,-0.199219,-0.071289,0.064453,-0.167969,-0.020874,-0.142578
best,-0.126953,0.021973,0.287109,0.15332,0.12793,0.032715,-0.115723,-0.029541,0.15332,0.011292,...,0.006439,-0.033936,-0.166016,-0.016846,-0.048584,-0.022827,-0.152344,-0.101562,-0.090332,0.088379
amazing,0.07373,0.004059,-0.135742,0.022095,0.180664,-0.046631,0.224609,-0.229492,-0.040039,0.225586,...,0.018433,-0.02124,-0.25,-0.020142,-0.310547,-0.207031,-0.006317,-0.141602,-0.150391,-0.137695
stop,-0.057861,0.013184,0.115234,0.069824,-0.306641,-0.044678,0.048584,0.152344,0.073242,-0.100098,...,0.100098,0.171875,-0.113281,0.064453,-0.115723,0.048096,-0.004822,0.086426,0.029907,0.007812
lies,0.149414,-0.012817,0.328125,0.025513,0.017334,0.19043,0.188477,-0.143555,-0.09082,0.206055,...,-0.308594,0.183594,-0.202148,0.031494,-0.164062,-0.201172,0.080078,-0.105469,0.149414,0.157227
pitiful,0.269531,0.253906,-0.020996,0.060303,-0.010925,0.217773,0.139648,-0.057617,0.3125,0.253906,...,-0.063477,0.132812,-0.094238,0.089355,-0.06543,-0.016235,-0.10791,-0.072266,-0.094238,0.028809
nerd,0.265625,-0.207031,-0.026611,0.419922,-0.208984,0.390625,0.164062,0.063965,0.149414,-0.0177,...,0.21582,0.125,-0.227539,-0.310547,-0.112793,-0.09668,0.255859,0.124023,-0.030273,0.082031


In [34]:
# Pytorch 텍스트 분류 모델 : Embedding + RNN + Linear로 이진 분류(logit) 출력
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class SimpleNet(nn.Module):
    # 정수 시퀀스를 임베딩 -> RNN -> 선형층으로 처리해 이진 분류 logit(1개)ㄹㄹ 출력
    def __init__(self,vocab_size,embedding_dim,hidden_size):
        super().__init__()
        self.embedding=nn.Embedding(        # 단어 ID를 밀집 벡터로 변환하는 임베딩 층
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,    # 임베딩 차원
            padding_idx=0
        )
        
        # 사전학습된 임베딩 벡터로 chrlghk : Embedding 가중치를 사전학습 행렬로 덮어쓰기
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float))
        
        self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)   # 입력(배치,길이.차원) 형태의 RNN
        self.out = nn.Linear(hidden_size,1)     # 맞막 은닉 상태를 1차원 logit으로 변환
        
    def forward(self,x):
        embedded = self.embedding(x)    # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        out, h_n = self.rnn(embedded)   # h_n: (num_layers*directions, batch, hidden_size)
        out = self.out(h_n.squeeze(0))  # (batch_size, hidden_size) -> (batch,1)
        return out  # 출력 : 시그모이드 전 logit(확률이 아님)
    
embedding_dim = model_wv.vectors.shape[1]   # 사전학습 임베딩 차원 (300)으로 임베딩 차원 설정
model = SimpleNet(vocab_size,embedding_dim,hidden_size=16)  # 어휘크기/임베딩차원/은닉크기로 모델 생성
print(model)

criterion=nn.BCEWithLogitsLoss()                       # 출력 logit과 정답(0/1)로 이진분류 손실 계산(시그모이드 포함)
optimizer=optim.Adam(model.parameters(),lr=0.005)

SimpleNet(
  (embedding): Embedding(17, 300, padding_idx=0)
  (rnn): RNN(300, 16, batch_first=True)
  (out): Linear(in_features=16, out_features=1, bias=True)
)


In [35]:
# 학습루프 : 미니배치 단위로 20 epoch 학습하여 평균 손실 출력
for epoch in range(20):
    epoch_loss=0    # 손실 누적
    
    for x_batch, y_batch in dataLoader:
        optimizer.zero_grad()
        output = model(x_batch)             # 순전파로 logit 계ㅏㄴ
        loss = criterion(output, y_batch)   # 예측LOGIT과 정답으로 손실 계산
        loss.backward()                     # 역전파로 기울기 계산
        optimizer.step()                    # 파라미터 업뎅트
        
        epoch_loss += loss.item()   # 배치손실을 float로 누적
        
    print(f"Epoch {epoch+1}: loss {epoch_loss / len(dataLoader)}")  # epoch별 편균 손실 계산

Epoch 1: loss 0.7069666683673859
Epoch 2: loss 0.5465743914246559
Epoch 3: loss 0.5055489987134933
Epoch 4: loss 0.39554160088300705
Epoch 5: loss 0.2865040823817253
Epoch 6: loss 0.22097426652908325
Epoch 7: loss 0.16070615500211716
Epoch 8: loss 0.11651797406375408
Epoch 9: loss 0.08373840525746346
Epoch 10: loss 0.06633596867322922
Epoch 11: loss 0.05124673433601856
Epoch 12: loss 0.04191670008003712
Epoch 13: loss 0.032123691868036985
Epoch 14: loss 0.028226724360138178
Epoch 15: loss 0.023623700253665447
Epoch 16: loss 0.021075201220810413
Epoch 17: loss 0.01825667219236493
Epoch 18: loss 0.016674587270244956
Epoch 19: loss 0.014839658979326487
Epoch 20: loss 0.013508475618436933


In [36]:
# 평가 / 예측 : 학습된 모델로 확률 -> 0/1 예측값 생성 후 정답과 비교
model.eval()
with torch.no_grad():               # 기울기 계산 비활성화
    output = model(X)               # 전체 샘플에 대한 예측 logit 계산
    prob = torch.sigmoid(output)    # logit에 0~1 확률로 계산
    pred = (prob >= 0.5).int()      # 임계값 0.5 기준으로 이진 분류(0/1) 예측값 생성
    
print(labels)
print(pred.squeeze().detach().numpy())  # 예측라벨을 1차원 numpy 배열로 변환

[1, 0, 0, 1, 1, 0, 1]
[1 0 0 1 1 0 1]


사전학습 임베딩을 사용했을 때에도 학습 데이터가 분류가 잘 되는지 파악한다.  
만약 틀린 샘플이 있다면 해당 문장이 OOV(0벡터) 비중이 큰지 확인해봐야한다.