In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

dtype = torch.FloatTensor

In [2]:
# Read the whole corpus file and keep one entry per newline-separated line.
with open('datasets.txt', mode='r', encoding='utf-8') as corpus_file:
    datasets = corpus_file.read().split('\n')

In [3]:
datasets[::40]

['발 없는 말이 천리 간다',
 '모난 돌이 정 맞는다',
 '고래 싸움에 새우 등 터진다',
 '끝 부러진 송곳',
 '들으면 병이요  안 들으면 약이다',
 '봄눈 녹듯 한다',
 '열 손가락을 깨물어서 안 아픈 손가락 없다',
 '첫딸은 살림 밑천이다']

In [4]:
len(datasets)

315

In [5]:
# Build the vocabulary: every distinct whitespace-separated token in the
# corpus, followed by the <UNK> token.
# sorted() (instead of a bare list(set(...))) keeps the word -> index mapping
# identical across kernel restarts; the iteration order of a set of strings
# changes from process to process because of hash randomisation.
word_list = sorted(set(" ".join(datasets).split()))
unk_token = '<UNK>'
word_list.append(unk_token)

In [6]:
# Two lookup tables over the vocabulary: index -> word and word -> index.
number_dict = dict(enumerate(word_list))
word_dict = {word: idx for idx, word in number_dict.items()}

In [7]:
# Hyper-parameters: vocabulary size comes from the lookup table, the rest are
# fixed constants (embedding width, hidden width, n-gram length).
VOCAB_SIZE = len(word_dict)
EMBED_SIZE, HIDDEN_SIZE, NGRAM_SIZE = 30, 32, 3

# Echo the configuration so it is recorded next to the results.
print('VOCAB_SIZE  :', VOCAB_SIZE)
print('EMBED_SIZE  :', EMBED_SIZE)
print('HIDDEN_SIZE :', HIDDEN_SIZE)
print('NGRAM_SIZE  :', NGRAM_SIZE)

VOCAB_SIZE  : 1008
EMBED_SIZE  : 30
HIDDEN_SIZE : 32
NGRAM_SIZE  : 3


In [8]:
# Left-pad every sentence that has fewer than NGRAM_SIZE words with <UNK>
# tokens so that it still produces one n-gram.
# - Pads with as many tokens as are missing (a single token is not enough for
#   a one-word sentence when NGRAM_SIZE is 3), which also makes the cell
#   idempotent when re-run.
# - Uses a local name (`words`) so the vocabulary list `word_list` is not
#   overwritten by a per-sentence word list.
# - Blank lines are left untouched rather than turned into all-<UNK> n-grams.
for i, sen in enumerate(datasets):
    words = sen.split()
    n_missing = NGRAM_SIZE - len(words)
    if words and n_missing > 0:
        datasets[i] = ' '.join([unk_token] * n_missing + words)

In [9]:
# Collapse every run of whitespace into a single space. split()/join handles
# runs of any length (one replace('  ', ' ') pass leaves a double space behind
# when three or more spaces occur) and also trims leading/trailing whitespace,
# so no empty-string tokens survive into the n-gram step.
datasets = [' '.join(sen.split()) for sen in datasets]

In [16]:
# Slide a window of NGRAM_SIZE words over each sentence and store every
# window as a space-joined string.
train_data = []
for sen in datasets:
    tokens = sen.strip().split(' ')
    n_windows = len(tokens) - NGRAM_SIZE + 1
    train_data.extend(
        ' '.join(tokens[start:start + NGRAM_SIZE]) for start in range(n_windows)
    )

In [121]:
def make_batch_generator(sentences, batch_size, vocab=None):
    """Yield mini-batches of encoded n-grams.

    Each n-gram string is split on whitespace; all words but the last become
    the input indices and the last word becomes the target index.

    Args:
        sentences: iterable of whitespace-separated n-gram strings.
        batch_size: maximum number of n-grams per yielded batch (>= 1).
        vocab: optional word -> index mapping. Defaults to the notebook-level
            ``word_dict`` when omitted.

    Yields:
        ``(input_batch, target_batch)`` where ``input_batch`` is a list of
        index lists and ``target_batch`` is a list of indices. Every batch has
        ``batch_size`` entries except possibly the last one, which holds the
        remainder.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
        KeyError: if a word is missing from the vocabulary.
        IndexError: if an n-gram string contains no words.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if vocab is None:
        vocab = word_dict

    input_batch, target_batch = [], []
    for sentence in sentences:
        words = sentence.split()
        input_batch.append([vocab[w] for w in words[:-1]])
        target_batch.append(vocab[words[-1]])
        if len(target_batch) == batch_size:
            yield input_batch, target_batch
            input_batch, target_batch = [], []

    # Flush the remainder; without this the final batch would be dropped.
    if target_batch:
        yield input_batch, target_batch
        
# Create a generator over train_data with a batch size of 32; batches are
# only built as the generator is iterated.
train_generator = make_batch_generator(train_data, 32)

In [120]:
def make_batch(sentences, vocab=None):
    """Encode all n-gram strings into one (inputs, targets) pair.

    Each n-gram string is split on whitespace; all words but the last become
    the input indices and the last word becomes the target index.

    Args:
        sentences: iterable of whitespace-separated n-gram strings.
        vocab: optional word -> index mapping. Defaults to the notebook-level
            ``word_dict`` when omitted.

    Returns:
        ``(input_batch, target_batch)``: a list of index lists and a list of
        target indices, in the same order as ``sentences``.

    Raises:
        KeyError: if a word is missing from the vocabulary.
        IndexError: if an n-gram string contains no words.
    """
    if vocab is None:
        vocab = word_dict

    input_batch, target_batch = [], []
    for sen in sentences:
        words = sen.split()
        input_batch.append([vocab[w] for w in words[:-1]])
        target_batch.append(vocab[words[-1]])

    return input_batch, target_batch

In [83]:
input_batch, target_batch = make_batch(train_data)

In [90]:
# Print the [start, end) index range of every mini-batch of 32 examples.
# Stepping the range by the batch size (instead of `len // 32 + 1`) avoids
# emitting an extra, empty range when the number of examples is an exact
# multiple of 32.
for start in range(0, len(target_batch), 32):
    print(start, start + 32)

0 32
32 64
64 96
96 128
128 160
160 192
192 224
224 256
256 288
288 320
320 352
352 384
384 416
416 448
448 480
480 512
512 544
544 576
576 608
608 640
640 672
672 704
704 736
736 768
768 800
800 832
832 864


In [21]:
# Take the first 32 examples as one mini-batch of int64 index tensors.
# torch.tensor(..., dtype=torch.long) replaces Variable(torch.LongTensor(...)):
# the Variable wrapper is deprecated and simply returns a tensor.
input_batch = torch.tensor(input_batch[:32], dtype=torch.long)
target_batch = torch.tensor(target_batch[:32], dtype=torch.long)

In [22]:
input_batch

tensor([[359, 424],
        [424, 584],
        [584, 893],
        [813, 947],
        [947, 583],
        [612, 900],
        [900, 474],
        [474, 809],
        [809, 354],
        [354, 277],
        [277,   0],
        [  0,  24],
        [ 24, 264],
        [264, 713],
        [479, 767],
        [767, 381],
        [381, 335],
        [335, 903],
        [903,  21],
        [ 21, 939],
        [939, 873],
        [873, 360],
        [360, 131],
        [942,  65],
        [ 65, 339],
        [339, 997],
        [997, 330],
        [394, 133],
        [564, 883],
        [883,  92],
        [564, 688],
        [688, 953]])

In [23]:
target_batch

tensor([584, 893, 495, 583, 992, 474, 809, 354, 277,   0,  24, 264, 713, 252,
        381, 335, 903,  21, 939, 873, 360, 131, 806, 339, 997, 330, 732, 685,
         92, 205, 953, 888])

In [27]:
input_batch = input_batch.cuda()

In [28]:
target_batch = target_batch.cuda()

In [38]:
# Stand-alone parameters used by the step-by-step cells below, which compute
#   output = b + X @ W + tanh(d + X @ H) @ U
# The tensors are created directly on the GPU and *then* wrapped in
# nn.Parameter. Calling .cuda() on an nn.Parameter returns a non-leaf copy
# that is no longer a Parameter, so its .grad would never be populated.
C = nn.Embedding(VOCAB_SIZE, EMBED_SIZE).cuda()
H = nn.Parameter(torch.randn((NGRAM_SIZE - 1) * EMBED_SIZE, HIDDEN_SIZE, dtype=torch.float32, device='cuda'))
W = nn.Parameter(torch.randn((NGRAM_SIZE - 1) * EMBED_SIZE, VOCAB_SIZE, dtype=torch.float32, device='cuda'))
d = nn.Parameter(torch.randn(HIDDEN_SIZE, dtype=torch.float32, device='cuda'))
U = nn.Parameter(torch.randn(HIDDEN_SIZE, VOCAB_SIZE, dtype=torch.float32, device='cuda'))
b = nn.Parameter(torch.randn(VOCAB_SIZE, dtype=torch.float32, device='cuda'))

In [43]:
input_batch.size()

torch.Size([32, 2])

In [46]:
list(C.parameters())[0].size()

torch.Size([1008, 30])

In [48]:
X = C(input_batch)

In [49]:
X.size()

torch.Size([32, 2, 30])

In [57]:
X.view(-1, (NGRAM_SIZE-1) * EMBED_SIZE).size()

torch.Size([32, 60])

In [58]:
X.view(32, -1).size()

torch.Size([32, 60])

In [62]:
X = X.view(32, -1)

In [64]:
tanh = torch.tanh(d + torch.mm(X, H))

In [65]:
tanh

tensor([[-0.9429, -1.0000, -0.9998,  ...,  0.9762, -0.9999,  0.9827],
        [-0.5350,  1.0000,  1.0000,  ..., -0.8565,  1.0000,  0.9929],
        [ 0.9966, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  0.9988],
        ...,
        [-0.3040,  1.0000, -0.9997,  ..., -1.0000,  0.9987,  0.4833],
        [-1.0000, -0.9961, -1.0000,  ...,  0.9943,  0.9964, -0.9790],
        [ 0.8429, -0.6702, -0.8870,  ..., -1.0000, -0.9822, -0.9526]],
       device='cuda:0', grad_fn=<TanhBackward>)

In [66]:
tanh.size()

torch.Size([32, 32])

In [68]:
output = b + torch.mm(X, W) + torch.mm(tanh, U)

In [69]:
output

tensor([[ -6.0588,  -8.7777,  16.5063,  ...,  11.1187,   7.6718,   7.4594],
        [ -7.8521,   7.7280,   9.7059,  ...,  -7.6543, -17.2225,   3.7947],
        [ 13.5479,  -1.4848,  -0.1105,  ..., -10.1636,  -9.1711,   9.1920],
        ...,
        [ 10.6150,  -2.1153,  -3.6730,  ...,  -6.7203,  -6.2340,   5.3053],
        [  8.2681,  14.5065,  11.0054,  ..., -10.2024,  14.5706,  17.0870],
        [ -1.3668,   3.6505, -12.9709,  ...,  -5.7174,  -2.6522,   5.4203]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [70]:
output.size()

torch.Size([32, 1008])

In [82]:
class NPLM(nn.Module):
    """Feed-forward n-gram language model.

    For a batch of context-word indices the model looks up and concatenates
    their embeddings into ``x`` and returns the unnormalised scores

        b + x @ W + tanh(d + x @ H) @ U

    Args:
        VOCAB_SIZE: number of words in the vocabulary (embedding rows and
            output width).
        EMBED_SIZE: width of each word embedding.
        HIDDEN_SIZE: width of the tanh hidden layer.
        NGRAM_SIZE: n-gram length; the model consumes ``NGRAM_SIZE - 1``
            context words per example.
        device: device to place the model on. When omitted, CUDA is used if
            available, otherwise the CPU.
    """

    def __init__(self,
                 VOCAB_SIZE,
                 EMBED_SIZE=30,
                 HIDDEN_SIZE=32,
                 NGRAM_SIZE=2,
                 device=None):
        super(NPLM, self).__init__()
        # Width of the concatenated context embeddings; forward() needs it to
        # flatten the embedding output.
        self.context_dim = (NGRAM_SIZE - 1) * EMBED_SIZE

        # Parameters are assigned as nn.Parameter objects (no .cuda() on the
        # Parameter itself) so that nn.Module registers them and
        # model.parameters() exposes all of them to the optimiser.
        self.C = nn.Embedding(VOCAB_SIZE, EMBED_SIZE)
        self.H = nn.Parameter(torch.randn(self.context_dim, HIDDEN_SIZE, dtype=torch.float32))
        self.W = nn.Parameter(torch.randn(self.context_dim, VOCAB_SIZE, dtype=torch.float32))
        self.d = nn.Parameter(torch.randn(HIDDEN_SIZE, dtype=torch.float32))
        self.U = nn.Parameter(torch.randn(HIDDEN_SIZE, VOCAB_SIZE, dtype=torch.float32))
        self.b = nn.Parameter(torch.randn(VOCAB_SIZE, dtype=torch.float32))

        # Move the whole module at once; Module.to keeps parameters registered.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.to(device)

    def forward(self, X):
        """Return scores of shape ``(batch, VOCAB_SIZE)``.

        Args:
            X: integer tensor of word indices with ``NGRAM_SIZE - 1`` columns.
        """
        X = self.C(X)
        # Concatenate the context embeddings of each example into one row.
        X = X.view(-1, self.context_dim)
        hidden = torch.tanh(self.d + torch.mm(X, self.H))
        return self.b + torch.mm(X, self.W) + torch.mm(hidden, self.U)

In [75]:
# NPLM requires the vocabulary size, so a bare NPLM() raises TypeError.
# The notebook's hyper-parameters are passed explicitly because the class
# default for NGRAM_SIZE is 2 while the training n-grams use NGRAM_SIZE = 3.
model = NPLM(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, NGRAM_SIZE)

In [76]:
# Cross-entropy loss, applied below to the (batch, VOCAB_SIZE) scores and the
# target word indices.
criterion = nn.CrossEntropyLoss()
# Adam over whatever model.parameters() yields, with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [77]:
optimizer.zero_grad()

In [81]:
criterion(output, target_batch)

tensor(30.5323, device='cuda:0', grad_fn=<NllLossBackward>)