RNN 기반 Language Model

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#import torchvision
import torch.optim as optim
# Report the installed PyTorch version and whether a CUDA device is usable, one per line.
print(torch.__version__, torch.cuda.is_available(), sep='\n')


2.0.1
False


In [15]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import tqdm
import os
import random
import time
import datetime

# for reproducibility: seed every random number generator this notebook draws from.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
# PyTorch has its own generator (weight init, DataLoader shuffling, sampling);
# seeding only `random` and NumPy leaves those non-deterministic.
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Dataset: Penn Treebank (PTB) dataset — the `train.txt` file from the yunjey/pytorch-tutorial language-model example

In [16]:
# `import urllib` alone does not guarantee that the `request` submodule is loaded;
# import it explicitly so this cell also works on a fresh kernel.
import urllib.request

# Read the training corpus as a list of raw byte lines (one sentence per line).
with urllib.request.urlopen('https://raw.githubusercontent.com/yunjey/pytorch-tutorial/master/tutorials/02-intermediate/language_model/data/train.txt') as f:
    data = f.readlines()

In [17]:
# Sanity check on the download: report the line count, then echo one raw (bytes) sample.
sentence_count = len(data)
print('num_sentence:', sentence_count)
data[100]

num_sentence: 42068


b" plans that give advertisers discounts for maintaining or increasing ad spending have become permanent <unk> at the news <unk> and underscore the fierce competition between newsweek time warner inc. 's time magazine and <unk> b. <unk> 's u.s. news & world report \n"

In [None]:
# Number of whitespace-separated tokens in every sentence.
seq_length_list = [len(line.split()) for line in data]

# Bucket the lengths into 20 bins and draw the pre-computed counts as a histogram.
counts, bins = np.histogram(seq_length_list, bins=20)
plt.hist(bins[:-1], bins, weights=counts)
plt.show()

In [21]:
# Maximum sentence length in tokens: longer sentences are truncated to this size, shorter ones are padded up to it.
max_seq_len = 50

단어 dictionary 만들기 (함수 `build_dictionary`)

In [22]:
def build_dictionary(data, max_seq_len):
    """Build the word <-> index lookup tables for the corpus.

    Ids 0 and 1 are reserved for ``'<pad>'`` and ``'<unk>'``; every other word
    receives the next free id in order of first appearance. Only the first
    ``max_seq_len`` words of each line are considered.

    Args:
        data: iterable of UTF-8 encoded ``bytes`` lines.
        max_seq_len: number of leading words of each line to scan.

    Returns:
        ``(word2idx, idx2word)``: a word -> id dict and its inverse.
    """
    # Reserved entries: padding and unknown-word markers.
    word2idx = {'<pad>': 0, '<unk>': 1}
    idx2word = {0: '<pad>', 1: '<unk>'}

    for raw_line in data:
        for word in raw_line.decode('utf-8').split()[:max_seq_len]:
            if word in word2idx:
                continue
            # Ids are dense, so the next free id equals the current vocabulary size.
            next_idx = len(word2idx)
            word2idx[word] = next_idx
            idx2word[next_idx] = word

    return word2idx, idx2word

# Build both vocabulary lookup tables from the downloaded corpus.
word2idx, idx2word = build_dictionary(data=data, max_seq_len=max_seq_len)

In [23]:
# Vocabulary sanity check: both lookup tables must hold exactly 10000 entries.
if not (len(word2idx) == len(idx2word) == 10000):
    raise AssertionError
print('Test Passed!')

Test Passed!


In [24]:
def preprocess(data, word2idx, idx2word, max_seq_len):
    """Convert raw sentences into one flat list of fixed-length token ids.

    Each line is UTF-8 decoded, split on whitespace, truncated to
    ``max_seq_len`` words and right-padded with the ``'<pad>'`` id, so every
    line contributes exactly ``max_seq_len`` ids to the result.

    Words missing from ``word2idx`` are mapped to the ``'<unk>'`` id instead of
    aborting, so text that was not used to build the vocabulary can be encoded.

    Args:
        data: iterable of UTF-8 encoded ``bytes`` lines.
        word2idx: word -> id mapping; must contain ``'<pad>'`` whenever a line
            is shorter than ``max_seq_len``.
        idx2word: unused; kept so existing call sites keep working.
        max_seq_len: fixed number of ids produced per line.

    Returns:
        A flat list of ``len(data) * max_seq_len`` ids.

    Raises:
        KeyError: if a word is unknown and ``word2idx`` has no ``'<unk>'``
            entry, or if padding is needed and ``'<pad>'`` is missing.
    """
    unk_idx = word2idx.get('<unk>')
    tokens = []
    for line in data:
        words = line.decode('utf-8').split()[:max_seq_len]
        for word in words:
            token = word2idx.get(word, unk_idx)
            if token is None:
                # No '<unk>' fallback is available: keep the original failure mode.
                raise KeyError(word)
            tokens.append(token)

        # Right-pad short sentences so every sentence has the same length.
        n_pad = max_seq_len - len(words)
        if n_pad > 0:
            tokens.extend([word2idx['<pad>']] * n_pad)

    return tokens

# Encode the whole corpus as a flat list of token ids (max_seq_len ids per sentence).
tokens = preprocess(data=data, word2idx=word2idx, idx2word=idx2word, max_seq_len=max_seq_len)

In [25]:
# Token-count sanity check: 42068 sentences x 50 ids each = 2103400 ids.
if len(tokens) != 2103400:
    raise AssertionError
print("Test Passed!")

Test Passed!


In [26]:
# Fold the flat id list into a matrix with one row of max_seq_len ids per sentence.
tokens = np.reshape(np.array(tokens), (-1, max_seq_len))
print(tokens.shape)
tokens[100]

(42068, 50)


array([745,  93, 746, 739, 747, 181, 748, 467, 749, 740, 750, 154, 751,
       752,   1, 160,  32, 753,   1,  48, 754,  32, 755, 756, 757, 728,
       555, 758,  99, 119, 555, 733,  48,   1, 759,   1, 119, 237, 753,
       230, 760, 347,   0,   0,   0,   0,   0,   0,   0,   0])

DataLoader

In [27]:
class LMDataset(torch.utils.data.Dataset):
    """Next-token prediction dataset over fixed-length token sequences.

    Item ``i`` is the pair ``(X, y)`` where ``X`` is row ``i`` of ``tokens`` and
    ``y`` is the same row shifted left by one position with a PAD id appended,
    i.e. ``y[t]`` is the token that follows ``X[t]``.

    Args:
        tokens: indexable collection of 1-D NumPy integer arrays (the notebook
            passes a 2-D array with one sentence per row).
    """

    def __init__(self, tokens):
        super(LMDataset, self).__init__()
        self.PAD = 0
        self.UNK = 1
        self.tokens = tokens
        # The stray `self._getitem(2)` call that used to live here was removed:
        # its result was discarded and it raised IndexError for datasets with
        # fewer than three sequences.

    def _getitem(self, index):
        """Return item ``index`` with a leading batch dimension of size 1."""
        X, y = self[index]
        return X.unsqueeze(0), y.unsqueeze(0)

    def __getitem__(self, index):
        """Return ``(X, y)`` as int64 tensors of the same length."""
        X = self.tokens[index]
        # Target = input shifted by one step; the last position has no successor, so use PAD.
        y = np.concatenate((X[1:], [self.PAD]))

        X = torch.from_numpy(X).long()
        y = torch.from_numpy(y).long()

        return X, y

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.tokens)

In [28]:
# Wrap the token matrix in a dataset and a shuffling loader that yields 64 sentences per batch.
batch_size = 64
dataset = LMDataset(tokens)
dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Number of sentences, then number of batches per epoch.
print(len(dataset), len(dataloader), sep='\n')

42068
658


2. Model
RNN (Recurrent Neural Network) 구조인 LSTM Model

In [30]:
class LSTMCell(nn.Module):
    """A single LSTM step: the recurrent hidden layer of the language model.

    Each gate is a linear map of the concatenation ``[x, h_0]``.

    Args:
        input_size: size of the last dimension of the input ``x``.
        hidden_size: size of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.Wi = nn.Linear(input_size + hidden_size, hidden_size)  # input gate
        self.Wf = nn.Linear(input_size + hidden_size, hidden_size)  # forget gate
        self.Wg = nn.Linear(input_size + hidden_size, hidden_size)  # candidate cell values ("gate gate")
        self.Wo = nn.Linear(input_size + hidden_size, hidden_size)  # output gate

    def forward(self, x, h_0, c_0):
        """
        Inputs
            input (x): [batch_size, input_size]
            hidden_state (h_0): [batch_size, hidden_size]
            cell_state (c_0): [batch_size, hidden_size]
        Outputs
            next_hidden_state (h_1): [batch_size, hidden_size]
            next_cell_state (c_1): [batch_size, hidden_size]
        """
        # Every gate sees the current input and the previous hidden state side by side.
        combined = torch.cat((x, h_0), 1)

        # The module never defined `self.sigmoid` / `self.tanh`; use the
        # functional forms from torch instead.
        i = torch.sigmoid(self.Wi(combined))
        f = torch.sigmoid(self.Wf(combined))
        g = torch.tanh(self.Wg(combined))
        o = torch.sigmoid(self.Wo(combined))

        # Keep part of the old cell state, add the gated candidate values.
        c_1 = f * c_0 + i * g
        h_1 = o * torch.tanh(c_1)

        return h_1, c_1

In [31]:
class LanguageModel(nn.Module):
    """Single-step language model: embedding -> LSTM cell -> vocabulary projection.

    Args:
        input_size: embedding dimension fed to the LSTM cell.
        hidden_size: size of the LSTM hidden and cell states.
        vocab_size: number of embedding rows and of output scores per step.
    """

    def __init__(self, input_size=64, hidden_size=64, vocab_size= 10000):
        super(LanguageModel, self).__init__()

        self.input_layer = nn.Embedding(vocab_size, input_size)
        self.hidden_layer = LSTMCell(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hx, cx, predict= False):
        """Advance the model by one time step.

        Args:
            x: token ids for the current step.
            hx: previous hidden state.
            cx: previous cell state.
            predict: when True, return token ids sampled from the softmax
                distribution instead of the raw scores.

        Returns:
            ``(out, hx, cx)``: the scores (or sampled ids) and the updated states.
        """
        embedded = self.input_layer(x)
        hx, cx = self.hidden_layer(embedded, hx, cx)
        scores = self.output_layer(hx)

        if predict == True:
            # Sample one token id per row from the predicted distribution.
            probs = F.softmax(scores, dim=1)
            sampled = torch.distributions.Categorical(probs).sample()
            return sampled, hx, cx

        return scores, hx, cx
    

3. Training

In [None]:
class Trainer():
    """Runs the training loop for a step-wise recurrent language model.

    Args:
        word2idx: word -> id mapping (stored for later use).
        idx2word: id -> word mapping (stored for later use).
        dataloader: iterable yielding ``(x_batch, y_batch)`` tensor pairs, each
            of shape ``[batch_size, max_seq_len]``.
        hidden_size: size of the hidden/cell state expected by ``model``.
        model: module called as ``model(x, hx, cx)`` for one time step and
            returning ``(scores, hx, cx)``.
        criterion: loss called as ``criterion(scores, targets)``.
        optimizer: optimizer over the model parameters.
        device: torch device to train on.

    Attributes:
        epoch_losses: mean training loss of every epoch run so far.
    """

    def __init__(self, word2idx, idx2word, dataloader, hidden_size, model, criterion, optimizer, device):
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.dataloader = dataloader
        # Was stored as `self.hiddensize` while `train` read `self.hidden_size`.
        self.hidden_size = hidden_size
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.epoch_losses = []

    def train(self, epochs = 1):
        """Train for ``epochs`` passes over the dataloader.

        Prints the mean loss and elapsed time after every epoch and appends the
        mean loss to ``self.epoch_losses``.
        """
        self.model.to(self.device)
        self.model.train()
        start_time = time.time()
        for epoch in range(epochs):
            losses = []  # per-batch losses of this epoch
            # Iterate the loader directly so tqdm can show the total batch count.
            for x_batch, y_batch in tqdm.tqdm(self.dataloader, desc=f'epoch {epoch + 1}/{epochs}'):
                batch_size, max_seq_len = x_batch.shape
                x_batch = x_batch.to(self.device)
                y_batch = y_batch.to(self.device)

                # initial hidden-states
                hx = torch.zeros(batch_size, self.hidden_size, device=self.device)
                cx = torch.zeros(batch_size, self.hidden_size, device=self.device)

                # Unroll the LSTM over the sequence, one time step at a time.
                ox_batch = []
                for s_idx in range(max_seq_len):
                    x = x_batch[:, s_idx]
                    ox, hx, cx = self.model(x, hx, cx)
                    ox_batch.append(ox)

                # Stack to [batch, time, vocab] and flatten so that row order
                # matches `y_batch.reshape(-1)` (batch-major).
                ox_batch = torch.stack(ox_batch, dim=1).reshape(batch_size * max_seq_len, -1)
                y_batch = y_batch.reshape(-1)

                # Clear stale gradients, backpropagate, then update the parameters.
                self.model.zero_grad()
                loss = self.criterion(ox_batch, y_batch)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            # Report progress; guard against an empty dataloader.
            mean_loss = sum(losses) / len(losses) if losses else float('nan')
            self.epoch_losses.append(mean_loss)
            elapsed = time.time() - start_time
            print(f'[epoch {epoch + 1}/{epochs}] mean loss: {mean_loss:.4f} | elapsed: {elapsed:.1f}s')

In [None]:
# Hyperparameters for the full training run.
lr = 1e-2
input_size, hidden_size, batch_size = 128, 128, 256

dataset = LMDataset(tokens)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Size the embedding/output layers from the actual vocabulary.
model = LanguageModel(input_size=input_size, hidden_size=hidden_size, vocab_size=len(word2idx))
# Index 0 is '<pad>': padded positions must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=lr)
# Fall back to the CPU when no GPU is present (hard-coding 'cuda' fails on CPU-only machines).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = Trainer(word2idx=word2idx,
                  idx2word=idx2word,
                  dataloader=dataloader,
                  hidden_size=hidden_size,  # required by Trainer; was missing from this call
                  model=model,
                  criterion=criterion,
                  optimizer=optimizer,
                  device=device)

trainer.train(epochs=50)