## TODO
~~1. Batching!~~   
    ~~-Encoder.Forward의 input 모양 어떻게 되지? / .view 인자 확인!~~
2. Attention  
3. Teacher Forcing  
~~4. Parameter(things to be updated) 등록 잘 됐나 확인(= 그래프 잘 그린 건지 어케 확인하나?)~~
5. Train / Dev 사전에 나누기
6. Pretrained word vector을 쓸 수가 있나..?  
**SOS, EOS, PAD token 관리 어디서?**   
**INTO GPU: x batch, y batch, parameter on GPU**
7. model save등등 train 뒷단 얘기  
    - torch.save


#### NOTE
1. Decoder가 2 layer일때, initial hidden?  
    - https://discuss.pytorch.org/t/understanding-output-of-lstm/12320/2
    - hidden 의 dimension
2. embedding 거친 후엔 batch_size x seq_len x embedding_dim 임!
    - embedding input은 batch_size x seq_len
3. LSTM의 batch_first
4. LSTM input의 dimension
5. NLLLoss()의 dimension

In [71]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm_notebook

from data_loader import get_dataset_loader

In [72]:
class Config:
    """Static settings for data files, model sizes and the training loop.

    Values that depend on the loaded dataset (vocabulary sizes) are not class
    attributes; they are attached to an instance later through ``add_info``.
    """

    # ---- data ----
    csv_file = 'data/complete_df.csv'
    vocab_file = 'crawling/Reviews_csv/vocab.txt'
    tag_vocab = 'crawling/tags_txt/tag_vocab.txt'
    # Raw rating label -> rating class id (several labels share one id).
    rating_dict = {
        '불만': 0, '추천안함': 0,
        '보통': 1,
        '추천': 2, '만족': 2,
        '적극추천': 3,
    }
    category = 'subcat'    # 'subcat' or 'category'
    padding_idx = 0

    # ---- encoder ----
    # pretrained = False
    attribute_size = 64

    # ---- decoder ----
    hidden_size = 512
    num_layers = 2

    # ---- training ----
    batch_size = 30
    dropout = 0.2
    num_steps = 100
    print_every = 1

    def add_info(self, rating_size, category_size, tag_size, output_size):
        """Record the sizes that are only determined after data loading."""
        self.rating_size, self.category_size = rating_size, category_size
        self.tag_size, self.output_size = tag_size, output_size

In [73]:
class Encoder(nn.Module):
    """Embed the (rating, category, tags) attributes and project them for the decoder.

    Each attribute is embedded into ``config.attribute_size`` dimensions; the
    tag embeddings are averaged over the non-padding tags (CBOW). The three
    vectors are concatenated and passed through a linear layer + tanh that
    produces ``hidden_size * num_layers`` features per sample.
    """

    def __init__(self, config):
        """Build the embeddings and output projection from ``config``.

        Reads ``rating_size``, ``category_size``, ``tag_size``,
        ``attribute_size``, ``padding_idx``, ``hidden_size`` and ``num_layers``.
        """
        super().__init__()
        self.config = config

        self.emb_rating = nn.Embedding(self.config.rating_size, self.config.attribute_size)
        self.emb_category = nn.Embedding(self.config.category_size, self.config.attribute_size)
        self.emb_tag = nn.Embedding(self.config.tag_size, self.config.attribute_size,
                                    padding_idx=self.config.padding_idx)
        self.out = nn.Linear(self.config.attribute_size * 3,
                             self.config.hidden_size * self.config.num_layers)
        self.init_hidden()

    def forward(self, rating, category, tag):
        """
        Inputs:
            rating: TENSOR of shape (batch_size, 1)
            category: TENSOR of shape (batch_size, 1)
            tag: TENSOR of shape (batch_size, tag_MAXLEN), padded with padding_idx
        Returns:
            attr: concatenated attribute embeddings (intended for attention),
                shape (batch_size, 1, 3 * attribute_size)
            encoder_output: tanh(linear(attr)),
                shape (batch_size, 1, hidden_size * num_layers)
        """
        assert len(rating) == len(category) == len(tag)
        attr_rating = self.emb_rating(rating)
        attr_category = self.emb_category(category)
        # Guard against rows that contain only padding: dividing by zero would
        # turn the whole batch's loss into NaN/inf.
        tag_len = self.get_tag_len(tag).clamp(min=1)
        attr_tag = torch.sum(self.emb_tag(tag), 1, keepdim=True) / tag_len    # CBOW

        attr = torch.cat((attr_rating, attr_category, attr_tag), 2)
        # torch.tanh replaces the deprecated F.tanh.
        encoder_output = torch.tanh(self.out(attr))
        return attr, encoder_output

    def get_tag_len(self, tag):
        """Return the number of non-padding tokens per row, as float (batch_size, 1, 1)."""
        return torch.sum(tag != self.config.padding_idx, 1).unsqueeze(1).unsqueeze(1).type(torch.float)

    def init_hidden(self):
        """Initialise every parameter uniformly in [-0.08, 0.08].

        The uniform init also overwrites the padding row of ``emb_tag`` (which
        nn.Embedding had zeroed), so that row is reset to zero afterwards;
        otherwise padded positions would contribute to the CBOW sum.
        """
        for param in self.parameters():
            nn.init.uniform_(param, -0.08, 0.08)
        pad = self.emb_tag.padding_idx
        if pad is not None:
            with torch.no_grad():
                self.emb_tag.weight[pad].zero_()

In [74]:
class Decoder(nn.Module):
    """LSTM decoder: token embedding -> multi-layer LSTM -> log-probabilities over the vocabulary."""

    def __init__(self, config):
        """Build the layers from ``config`` (output_size, hidden_size, num_layers, dropout)."""
        super().__init__()
        self.config = config
        cfg = self.config

        # TODO: if self.config.pretrained = True
        self.embedding = nn.Embedding(cfg.output_size, cfg.hidden_size)
        self.lstm = nn.LSTM(
            cfg.hidden_size,
            cfg.hidden_size,
            num_layers=cfg.num_layers,
            dropout=cfg.dropout,
            batch_first=True,
        )
        self.out = nn.Linear(cfg.hidden_size, cfg.output_size)

    def forward(self, input_token, hidden):
        """
        Inputs:
            input_token: TENSOR of shape (batch_size, 1)
            hidden: LSTM state tuple (hx, cx); initially derived from the encoder output
        Returns:
            log_probs: log-softmax scores over the vocabulary (softmax on dim 2)
            next_hidden: updated LSTM state tuple (hx, cx)
        """
        # The sequence dimension is 1, so the decoder is only used one step
        # at a time (unrolled by the caller).
        embedded = self.embedding(input_token)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        scores = self.out(lstm_out)
        log_probs = F.log_softmax(scores, dim=2)
        return log_probs, next_hidden

    def initHidden(self):
        """Placeholder; currently does nothing."""
        return None

In [75]:
def splitHidden(encoder_output, config):
    """Turn the encoder output into the decoder LSTM's initial state (h_0, c_0).

    Inputs:
        encoder_output: TENSOR with batch as its first dimension and
            ``num_layers * hidden_size`` features per sample, e.g.
            (batch_size, 1, num_layers * hidden_size)
        config: object providing ``num_layers`` and ``hidden_size``
    Returns:
        (h_0, c_0), each of shape (num_layers, batch_size, hidden_size);
        c_0 is all zeros.
    """
    batch_size = encoder_output.size(0)
    # Split each sample's feature vector into one chunk per layer first and
    # only then move the layer axis to the front. Viewing the batch-major
    # tensor directly as (num_layers, batch, hidden) would hand sample i
    # features that belong to a different sample.
    per_sample = encoder_output.reshape(batch_size, config.num_layers, config.hidden_size)
    h_0 = per_sample.transpose(0, 1).contiguous()
    c_0 = torch.zeros_like(h_0)
    return (h_0, c_0)

def train(encoder, decoder, dataloader, loss_fn, optimizer, config, verbose=False):
    """Run ``config.num_steps`` optimisation steps of the encoder/decoder pair.

    Inputs:
        encoder, decoder: the modules to train (switched to train mode)
        dataloader: iterable yielding (rating, category, tag, target) batches;
            target is assumed to be (batch_size, target_length) — TODO confirm
            against data_loader
        loss_fn: loss applied to (batch*target_length, vocab) log-probabilities
            and flattened targets; its result is divided here by the number of
            non-padding target tokens
        optimizer: optimizer over the encoder and decoder parameters
        config: provides ``num_steps`` and ``print_every``
        verbose: when truthy, print the loss every ``config.print_every`` steps

    The decoder is fed its own greedy prediction at every step (no teacher
    forcing yet).
    """
    encoder.train()
    decoder.train()
    data_iter = iter(dataloader)

    for t in tqdm_notebook(range(config.num_steps)):
        # Restart the loader when an epoch ends so that num_steps may exceed
        # the number of batches instead of dying with StopIteration.
        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(dataloader)
            batch = next(data_iter)
        rating_tensor, category_tensor, tag_tensor, target_tensor = batch

        # Use the real batch size: the final batch of an epoch can be smaller
        # than config.batch_size.
        batch_size = target_tensor.size(0)
        target_length = target_tensor.size(-1)

        optimizer.zero_grad()
        _, encoder_output = encoder(rating_tensor, category_tensor, tag_tensor)
        decoder_hidden = splitHidden(encoder_output, encoder.config)

        # SOS token. NOTE(review): index 0 is also config.padding_idx —
        # confirm which index the vocabulary really uses for SOS.
        decoder_input = torch.zeros((batch_size, 1), dtype=torch.long,
                                    device=target_tensor.device)
        decoder_outputs = []
        for _ in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            # Greedy choice becomes the next input; detach so gradients do not
            # flow through the sampled token.
            decoder_input = topi.detach().view(batch_size, 1)
            decoder_outputs.append(decoder_output)

        # Reshape both tensors to what the loss function expects:
        # (batch*target_length, vocab) and (batch*target_length,).
        decoder_outputs = torch.cat(decoder_outputs, 1).view(batch_size * target_length, -1)
        target_tensor = target_tensor.reshape(-1)
        loss = loss_fn(decoder_outputs, target_tensor)
        num_actual_token = torch.sum(target_tensor != encoder.config.padding_idx).item()
        # max(..., 1) avoids a division by zero on an all-padding batch.
        loss = loss / max(num_actual_token, 1)

        if verbose and t % config.print_every == 0:
            print("loss at %d step: %f" % (t, loss.item()))

        loss.backward()
        optimizer.step()

### Get data & set config

In [76]:
# Load the dataset first: the vocabulary sizes are only known afterwards.
config = Config()
dataset, dataloader = get_dataset_loader(
    config.csv_file,
    config.vocab_file,
    config.tag_vocab,
    config.rating_dict,
    config.category,
    config.batch_size,
)

# Size of each lookup table built by the dataset.
rating_size, category_size, tag_size, vocab_size = (
    len(dataset.rating2idx),
    len(dataset.category2idx),
    len(dataset.tag2idx),
    len(dataset.word2idx),
)
# The decoder's output size is the word vocabulary size.
config.add_info(rating_size, category_size, tag_size, vocab_size)

### Instantiate model and start training

In [None]:
encoder = Encoder(config)
decoder = Decoder(config)

# One optimizer updates the encoder and the decoder jointly.
params = list(encoder.parameters()) + list(decoder.parameters())
# Sum the per-token losses and skip padding targets; train() divides the sum
# by the number of non-padding tokens. reduction='sum' is the replacement for
# the deprecated size_average=False.
loss_fn = nn.NLLLoss(reduction='sum', ignore_index=config.padding_idx)
optimizer = optim.Adam(params, lr=0.001)

train(encoder, decoder, dataloader, loss_fn, optimizer, config, verbose=True)

HBox(children=(IntProgress(value=0), HTML(value='')))

loss at 0 step: 10.103978
loss at 1 step: 10.082250
loss at 2 step: 10.058656
loss at 3 step: 10.012200
loss at 4 step: 9.802145
loss at 5 step: 9.612991
loss at 6 step: 9.367873
loss at 7 step: 9.009171
loss at 8 step: 9.006889
loss at 9 step: 9.006299
loss at 10 step: 8.575359
loss at 11 step: 8.614898
loss at 12 step: 8.040228
loss at 13 step: 8.029733
loss at 14 step: 8.648287
loss at 15 step: 8.455800
loss at 16 step: 8.504720
loss at 17 step: 8.548656
loss at 18 step: 8.236914
loss at 19 step: 8.195976
loss at 20 step: 8.577085
loss at 21 step: 8.271413
loss at 22 step: 8.236559
loss at 23 step: 8.435886
