# 4.1 NPLM (Neural Probability Language Model)

NPLM을 파이토치로 구현해보자!

<img src = 'images/NPLM.png'>

In [310]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [43]:
N = 4 # n-gram의 갯수
VEC_DIM = 100 # 임베딩 차원
HIDDEN_DIM = 100 # 은닉 차원

### 먼저 데이터를 벡터화부터 시작하자.

In [3]:
txt_path = 'data/tokenized/korquad_mecab.txt'
corpus = [sent.strip().split() for sent in open(txt_path, 'r', encoding='utf-8').readlines()]

In [4]:
for word in corpus[1]:
    print(word)

바그너
는
괴테
의
파우스트
를
읽
고
무엇
을
쓰
고자
했
는가
?
교향곡


In [5]:
def sent_to_ngram(n, sent):
    ngrams = []
    
    for i in range(len(sent)-n+1):
        ngram = []
        for j in range(n):
            ngram.append(sent[i+j])
        ngrams.append(ngram)
            
    return ngrams

In [14]:
sent_to_ngram(N, corpus[1])

[['바그너', '는', '괴테', '의'],
 ['는', '괴테', '의', '파우스트'],
 ['괴테', '의', '파우스트', '를'],
 ['의', '파우스트', '를', '읽'],
 ['파우스트', '를', '읽', '고'],
 ['를', '읽', '고', '무엇'],
 ['읽', '고', '무엇', '을'],
 ['고', '무엇', '을', '쓰'],
 ['무엇', '을', '쓰', '고자'],
 ['을', '쓰', '고자', '했'],
 ['쓰', '고자', '했', '는가'],
 ['고자', '했', '는가', '?'],
 ['했', '는가', '?', '교향곡']]

단어장과 인덱스를 만들자

In [10]:
vocab = set()
for i in range(len(corpus)):
    vocab.update(corpus[i])
    
len(vocab)

In [17]:
word_to_id = dict()
for word in vocab:
    word_to_id[word]=len(word_to_id)    

len(word_to_id)

83593

In [31]:
word_to_id

{'Deluge': 0,
 '정충': 1,
 '長': 2,
 'affinity': 3,
 '화질': 4,
 '총감독': 5,
 '辰': 6,
 '생겼': 7,
 '관조': 8,
 '恤': 9,
 '나폴레옹': 10,
 '고위직': 11,
 '엔네아드': 12,
 '이타야': 13,
 '포켓형': 14,
 '안치': 15,
 '방해죄': 16,
 '묻': 17,
 '쓰러뜨렸으나': 18,
 '반짝이': 19,
 '입장객': 20,
 '코핀': 21,
 '보슬리': 22,
 '매슈': 23,
 'Treblinka': 24,
 '베스타': 25,
 '소카': 26,
 '두르가': 27,
 '김윤미': 28,
 '타르지': 29,
 '최인': 30,
 '콘코': 31,
 '히로시': 32,
 '한효주': 33,
 '하코트': 34,
 '불러온다': 35,
 '에마누엘레': 36,
 '수파': 37,
 '주원인': 38,
 '레이지': 39,
 '천대': 40,
 '그서': 41,
 '청담': 42,
 '불타오르': 43,
 '냄새샘': 44,
 '페널티킥': 45,
 '권문세족': 46,
 '영포': 47,
 '봄베이': 48,
 '영영': 49,
 '아리안나': 50,
 '서유견문': 51,
 '이이다': 52,
 '국자': 53,
 '축대': 54,
 '노란': 55,
 '>(': 56,
 '이시아': 57,
 '부사': 58,
 '愁城': 59,
 '균형발전': 60,
 '과전': 61,
 '수양버들': 62,
 '페럴': 63,
 '식힌다': 64,
 '번영': 65,
 '몇대': 66,
 '들어차': 67,
 '스머프': 68,
 '제멋': 69,
 'ICLEI': 70,
 '시끄러운': 71,
 '기영': 72,
 '투생': 73,
 '맞아떨어지': 74,
 '리쿠': 75,
 '성배': 76,
 '조각자': 77,
 '칙스': 78,
 '칭할': 79,
 'nH': 80,
 '헤스': 81,
 '가진다는': 82,
 '빙그레': 83,
 '都巡': 84,

In [28]:
id_to_word = []
for word in word_to_id.keys():
    id_to_word.append(word)

훈련 데이터를 만들자

In [20]:
train_data = []
for i in range(len(corpus)):
    train_data += sent_to_ngram(N, corpus[i])

In [21]:
train_data

[['1', '8', '3', '9'],
 ['8', '3', '9', '년'],
 ['3', '9', '년', '바그너'],
 ['9', '년', '바그너', '는'],
 ['년', '바그너', '는', '괴테'],
 ['바그너', '는', '괴테', '의'],
 ['는', '괴테', '의', '파우스트'],
 ['괴테', '의', '파우스트', '을'],
 ['의', '파우스트', '을', '처음'],
 ['파우스트', '을', '처음', '읽'],
 ['을', '처음', '읽', '고'],
 ['처음', '읽', '고', '그'],
 ['읽', '고', '그', '내용'],
 ['고', '그', '내용', '에'],
 ['그', '내용', '에', '마음'],
 ['내용', '에', '마음', '이'],
 ['에', '마음', '이', '끌려'],
 ['마음', '이', '끌려', '이'],
 ['이', '끌려', '이', '를'],
 ['끌려', '이', '를', '소재'],
 ['이', '를', '소재', '로'],
 ['를', '소재', '로', '해서'],
 ['소재', '로', '해서', '하나'],
 ['로', '해서', '하나', '의'],
 ['해서', '하나', '의', '교향곡'],
 ['하나', '의', '교향곡', '을'],
 ['의', '교향곡', '을', '쓰'],
 ['교향곡', '을', '쓰', '려는'],
 ['을', '쓰', '려는', '뜻'],
 ['쓰', '려는', '뜻', '을'],
 ['려는', '뜻', '을', '갖'],
 ['뜻', '을', '갖', '는다'],
 ['을', '갖', '는다', '.'],
 ['갖', '는다', '.', '이'],
 ['는다', '.', '이', '시기'],
 ['.', '이', '시기', '바그너'],
 ['이', '시기', '바그너', '는'],
 ['시기', '바그너', '는', '1'],
 ['바그너', '는', '1', '8'],
 ['는', '1', '8', '3'],


In [32]:
len(train_data)

3862986

In [34]:
def sent_to_vec(sent):
    vec = []
    for word in sent:
        vec.append(word_to_id[word])
    return vec

In [38]:
train_input = []
train_target = []
for data in train_data:
    train_input.append(sent_to_vec(data[:-1]))
    train_target.append(word_to_id[data[-1]])

In [39]:
len(train_input), len(train_target)

(3862986, 3862986)

In [40]:
train_input[0], train_target[0]

([50472, 65316, 52881], 9126)

In [262]:
train_input_ts = torch.tensor(train_input)
train_target_ts = torch.tensor(train_target)

In [264]:
train = torch.utils.data.TensorDataset(train_input_ts, train_target_ts)

### 모델을 만들어보자.

먼저 임베딩 벡터를 concatenate하는 방법에 대해 고민해보자. 

In [54]:
emb = nn.Embedding(len(vocab) * (N-1), 5)

In [55]:
vec = emb(torch.tensor([train_input[0]]))
vec.shape

torch.Size([1, 3, 5])

In [64]:
torch.flatten(vec[0])

tensor([-0.9689,  0.5371,  2.0395,  1.3356, -0.6704,  0.0093, -1.1420,  1.7707,
         1.9236,  0.6581,  1.2249, -0.0710,  0.6953, -0.9278, -0.0994],
       grad_fn=<AsStridedBackward>)

따라서 입력을 하나씩 받을 경우는 그냥 flatten 쓰면 되지만, 배치로 받을 때는?

In [73]:
vec = emb(torch.tensor([train_input[0],train_input[1]]))
vec.shape

torch.Size([2, 3, 5])

In [77]:
vec

tensor([[[-0.9689,  0.5371,  2.0395,  1.3356, -0.6704],
         [ 0.0093, -1.1420,  1.7707,  1.9236,  0.6581],
         [ 1.2249, -0.0710,  0.6953, -0.9278, -0.0994]],

        [[ 0.0093, -1.1420,  1.7707,  1.9236,  0.6581],
         [ 1.2249, -0.0710,  0.6953, -0.9278, -0.0994],
         [-3.7253,  0.1673,  0.1281,  0.9819, -0.5825]]],
       grad_fn=<EmbeddingBackward>)

In [80]:
vec_tu= torch.cat(tuple(vec),1)
vec_tu

tensor([[-0.9689,  0.5371,  2.0395,  1.3356, -0.6704,  0.0093, -1.1420,  1.7707,
          1.9236,  0.6581],
        [ 0.0093, -1.1420,  1.7707,  1.9236,  0.6581,  1.2249, -0.0710,  0.6953,
         -0.9278, -0.0994],
        [ 1.2249, -0.0710,  0.6953, -0.9278, -0.0994, -3.7253,  0.1673,  0.1281,
          0.9819, -0.5825]], grad_fn=<CatBackward>)

torch.cat은 안될 것 같고, 찾아보니 flatten에 start_dim 옵션이 있음!

In [82]:
torch.flatten??

In [81]:
vec_tu= torch.flatten(vec,1)
vec_tu

tensor([[-0.9689,  0.5371,  2.0395,  1.3356, -0.6704,  0.0093, -1.1420,  1.7707,
          1.9236,  0.6581,  1.2249, -0.0710,  0.6953, -0.9278, -0.0994],
        [ 0.0093, -1.1420,  1.7707,  1.9236,  0.6581,  1.2249, -0.0710,  0.6953,
         -0.9278, -0.0994, -3.7253,  0.1673,  0.1281,  0.9819, -0.5825]],
       grad_fn=<AsStridedBackward>)

원하는 결과를 얻음!

근데 어차피 Linear layer가 필요하므로 원핫인코딩을 해야함...

In [91]:
batch = torch.tensor([train_input[0],train_input[1]])
batch.shape

torch.Size([2, 3])

인덱스 벡터를 받으면 원핫인코딩 벡터를 반환하는 함수를 만들자

In [113]:
def vec_to_onehot(vec, oh_len):
    ohv = torch.zeros((batch.shape[0], batch.shape[1], oh_len))
    for i in range(vec.shape[0]):
        for j in range(vec.shape[1]):
            ohv[i][j][vec[i][j]] = 1
            
    return ohv    

In [116]:
vec_to_onehot(batch, len(vocab)).shape

torch.Size([2, 3, 83593])

In [120]:
vec_oh = vec_to_onehot(batch, len(vocab))
vec_oh_tu= torch.flatten(vec_oh,1)

In [122]:
vec_oh_tu.shape

torch.Size([2, 250779])

In [117]:
lin = nn.Linear(len(vocab) * (N-1), 5)

In [124]:
lin(vec_oh_tu).shape

torch.Size([2, 5])

아예 이 과정을 하나의 함수로 만들자.

In [127]:
def batch_to_input(batch, oh_len):
    ohv = torch.zeros((batch.shape[0], batch.shape[1], oh_len))
    for i in range(batch.shape[0]):
        for j in range(batch.shape[1]):
            ohv[i][j][batch[i][j]] = 1
            
    return torch.flatten(ohv,1)    

In [129]:
batch_to_input(batch, len(vocab)).shape

torch.Size([2, 250779])

In [131]:
lin(batch_to_input(batch, len(vocab))).shape

torch.Size([2, 5])

생각해보니 착각을 했다. 모델의 입력은 인덱스로 받아서 먼저 벡터 임베딩 후 Linear layer에 태우면 되니까, 그냥 임베딩 써도 됨

In [153]:
loss_f = nn.CrossEntropyLoss()

In [295]:
class NPLM(nn.Module):
    def __init__(self, n, vec_dim, hidden_dim):
        super(NPLM,self).__init__()
        self.emb = nn.Embedding(len(vocab), vec_dim)
        self.lin1 = nn.Linear(vec_dim * (n-1), hidden_dim)
        self.lin2 = nn.Linear(hidden_dim, len(vocab), bias=False)
        self.lin3 = nn.Linear(vec_dim * (n-1), len(vocab))
        
    def forward(self, x):
        # x : [BATCH_SIZE, N-1]
        # self.emb(x) : [BATCH_SIZE, N-1, VEC_DIM]
        x = self.emb(x).view(len(x),-1)
        y = torch.tanh(self.lin1(x))
        z = self.lin3(x) + self.lin2(y)
        return z

In [296]:
model = NPLM(N, VEC_DIM, HIDDEN_DIM)

In [155]:
target_batch = torch.tensor([train_target[0],train_target[1]])
target_batch

tensor([ 9126, 52737])

In [157]:
loss_f(output, target_batch)

tensor(11.2042, grad_fn=<NllLossBackward>)

이 loss가 올바른 결과인지 확인해보자

In [172]:
F.softmax(output, 1) # output을 소프트맥스 변환

tensor([[1.4428e-05, 1.2722e-05, 2.4266e-05,  ..., 7.4909e-06, 8.3130e-06,
         2.4767e-06],
        [1.2748e-05, 1.0737e-05, 1.9825e-05,  ..., 8.3171e-06, 2.4149e-05,
         3.8782e-06]], grad_fn=<SoftmaxBackward>)

In [173]:
F.softmax(output, 1)[0].sum() # 차원이 맞게 되었는지 확인

tensor(1.0000, grad_fn=<SumBackward0>)

In [174]:
torch.log(F.softmax(output, 1)[0][target_batch[0]]) # 출력의 target번째 좌표의 로그값

tensor(-11.4189, grad_fn=<LogBackward>)

In [170]:
torch.log(F.softmax(output, 1)[1][target_batch[1]])

tensor(-10.9895, grad_fn=<LogBackward>)

In [171]:
-(torch.log(F.softmax(output, 1)[0][target_batch[0]]) + torch.log(F.softmax(output, 1)[1][target_batch[1]]))/2

tensor(11.2042, grad_fn=<DivBackward0>)

따라서 각각의 negative loss의 평균이다! 맞게 계산됨

### 훈련 과정

In [297]:
BATCH_SIZE = 64
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [298]:
input_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)

In [269]:
x, y = next(iter(input_loader))

In [270]:
x.shape

torch.Size([64, 3])

In [271]:
y.shape

torch.Size([64])

앞에서 임베딩도 잘못 했음...

In [273]:
emb = nn.Embedding(len(vocab), 10)

In [281]:
emb(x).shape

torch.Size([64, 3, 10])

In [283]:
out = emb(x)

In [285]:
out

tensor([[[-1.3797, -0.5497, -0.1819,  ...,  0.2293,  1.3921,  0.3705],
         [-0.9443, -1.9360,  1.0009,  ...,  1.3524, -0.2080,  1.1619],
         [ 1.2290, -0.0642,  0.6032,  ..., -0.1276,  0.3939,  0.1067]],

        [[-0.9443, -1.9360,  1.0009,  ...,  1.3524, -0.2080,  1.1619],
         [ 1.2290, -0.0642,  0.6032,  ..., -0.1276,  0.3939,  0.1067],
         [ 0.6412, -0.5177,  1.2241,  ..., -0.3652, -0.4658,  1.4531]],

        [[ 1.2290, -0.0642,  0.6032,  ..., -0.1276,  0.3939,  0.1067],
         [ 0.6412, -0.5177,  1.2241,  ..., -0.3652, -0.4658,  1.4531],
         [-0.6553, -0.0878,  0.1446,  ..., -0.9161,  1.3431,  0.0070]],

        ...,

        [[-0.6617,  0.6439, -0.0419,  ..., -0.3900,  0.9162, -0.4438],
         [ 0.4282, -0.8271, -0.3454,  ..., -1.4361,  0.6106, -0.6443],
         [ 1.0708, -1.7279,  0.0475,  ...,  1.2346, -0.0447,  0.2068]],

        [[ 0.4282, -0.8271, -0.3454,  ..., -1.4361,  0.6106, -0.6443],
         [ 1.0708, -1.7279,  0.0475,  ...,  1.2346, -0.

In [284]:
out.view(BATCH_SIZE,-1)

tensor([[-1.3797, -0.5497, -0.1819,  ..., -0.1276,  0.3939,  0.1067],
        [-0.9443, -1.9360,  1.0009,  ..., -0.3652, -0.4658,  1.4531],
        [ 1.2290, -0.0642,  0.6032,  ..., -0.9161,  1.3431,  0.0070],
        ...,
        [-0.6617,  0.6439, -0.0419,  ...,  1.2346, -0.0447,  0.2068],
        [ 0.4282, -0.8271, -0.3454,  ..., -0.0195, -1.1323, -1.1402],
        [ 1.0708, -1.7279,  0.0475,  ...,  0.7455, -1.3573, -1.6670]],
       grad_fn=<ViewBackward>)

In [305]:
import time

In [309]:
# CPU time
start = time.time()

epoch = 1
loss_list = []

for epo in range(epoch):
    loss_sum = 0
    for i, (x, y) in enumerate(input_loader):

        
        optimizer.zero_grad()
        
        output = model(x)
        
        loss = loss_f(output, y)
        loss.backward()
        optimizer.step()
        
        loss_sum += loss.item()
        
        
        if i % 50 == 0:        
            loss_list.append(loss_sum/50)
            print('Epoch : {}, Iteration : {}, Loss : {:.2f}, Elapsed time : {:.0f}s'.format(epo, i, loss_sum/50, time.time()-start))
            loss_sum = 0

Epoch : 0, Iteration : 0, Loss : 0.02, Elapsed time : 1
Epoch : 0, Iteration : 50, Loss : 4.04, Elapsed time : 43
Epoch : 0, Iteration : 100, Loss : 3.65, Elapsed time : 85
Epoch : 0, Iteration : 150, Loss : 3.99, Elapsed time : 127


KeyboardInterrupt: 

CPU 에서 GPU 로 옮겨서 해보자

In [311]:
model.to(dev)

NPLM(
  (emb): Embedding(83593, 100)
  (lin1): Linear(in_features=300, out_features=100, bias=True)
  (lin2): Linear(in_features=100, out_features=83593, bias=False)
  (lin3): Linear(in_features=300, out_features=83593, bias=True)
)

In [312]:
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [313]:
# CPU time
start = time.time()

epoch = 1
loss_list = []

for epo in range(epoch):
    loss_sum = 0
    for i, (x, y) in enumerate(input_loader):

        x,y = x.to(dev), y.to(dev)
        optimizer.zero_grad()
        
        output = model(x)
        
        loss = loss_f(output, y)
        loss.backward()
        optimizer.step()
        
        loss_sum += loss.item()
        
        
        if i % 50 == 0:        
            loss_list.append(loss_sum/50)
            print('Epoch : {}, Iteration : {}, Loss : {:.2f}, Elapsed time : {:.0f}s'.format(epo, i, loss_sum/50, time.time()-start))
            loss_sum = 0

Epoch : 0, Iteration : 0, Loss : 0.04, Elapsed time : 1s
Epoch : 0, Iteration : 50, Loss : 2.34, Elapsed time : 2s
Epoch : 0, Iteration : 100, Loss : 2.49, Elapsed time : 4s
Epoch : 0, Iteration : 150, Loss : 2.62, Elapsed time : 6s
Epoch : 0, Iteration : 200, Loss : 5.17, Elapsed time : 8s
Epoch : 0, Iteration : 250, Loss : 5.63, Elapsed time : 10s
Epoch : 0, Iteration : 300, Loss : 5.91, Elapsed time : 12s
Epoch : 0, Iteration : 350, Loss : 6.03, Elapsed time : 14s
Epoch : 0, Iteration : 400, Loss : 6.05, Elapsed time : 16s


KeyboardInterrupt: 

대략 43/2 = 20배 정도 빨라짐... 갓 GPU

In [315]:
check = 5000
epoch = 10

In [316]:
# CPU time
start = time.time()

epoch = 1
loss_list = []

for epo in range(epoch):
    loss_sum = 0
    for i, (x, y) in enumerate(input_loader):

        x,y = x.to(dev), y.to(dev)
        optimizer.zero_grad()
        
        output = model(x)
        
        loss = loss_f(output, y)
        loss.backward()
        optimizer.step()
        
        loss_sum += loss.item()
        
        
        if i % check == 0:        
            loss_list.append(loss_sum/check)
            print('Epoch : {}, Iteration : {}, Loss : {:.2f}, Elapsed time : {:.0f}s'.format(epo, i, loss_sum/check, time.time()-start))
            loss_sum = 0

Epoch : 0, Iteration : 0, Loss : 0.00, Elapsed time : 0s
Epoch : 0, Iteration : 5000, Loss : 13.09, Elapsed time : 188s
Epoch : 0, Iteration : 10000, Loss : 13.52, Elapsed time : 373s
Epoch : 0, Iteration : 15000, Loss : 13.55, Elapsed time : 558s
Epoch : 0, Iteration : 20000, Loss : 13.87, Elapsed time : 743s
Epoch : 0, Iteration : 25000, Loss : 13.53, Elapsed time : 928s
Epoch : 0, Iteration : 30000, Loss : 13.86, Elapsed time : 1113s
Epoch : 0, Iteration : 35000, Loss : 13.95, Elapsed time : 1298s
Epoch : 0, Iteration : 40000, Loss : 13.94, Elapsed time : 1482s
Epoch : 0, Iteration : 45000, Loss : 14.09, Elapsed time : 1667s
Epoch : 0, Iteration : 50000, Loss : 14.32, Elapsed time : 1852s
Epoch : 0, Iteration : 55000, Loss : 14.30, Elapsed time : 2036s
Epoch : 0, Iteration : 60000, Loss : 14.21, Elapsed time : 2220s


RuntimeError: shape '[64, -1]' is invalid for input of size 3000

In [None]:
save_path = 'NPLM_{}epoch.pt'.format(epoch)
torch.save(model.state_dict(), save_path)