## TODO
~~1. Batching!~~   
    ~~-Encoder.Forward의 input 모양 어떻게 되지? / .view 인자 확인!~~
~~2. Attention~~
3. Teacher Forcing  
~~4. Parameter(things to be updated) 등록 잘 됐나 확인(= 그래프 잘 그린 건지 어케 확인하나?)~~
5. Train / Dev 사전에 나누기
6. Pretrained word vector을 쓸 수가 있나..?  
~~SOS, EOS, PAD token 관리 어디서?~~  
** INTO GPU: x batch, y batch, parameter On GPU**
7. model save등등 train 뒷단 얘기  
    -torch.save


#### NOTE
1. Decoder가 2 layer일때, initial hidden?  
    - https://discuss.pytorch.org/t/understanding-output-of-lstm/12320/2
    - hidden 의 dimension
2. embedding 거친 후엔 batch_size x seq_len x embedding_dim 임!
    - embedding input은 batch_size x seq_len
2. LSTM의 batch_first
3. LSTM input의 dimension
4. NLLLoss()의 dimension

In [163]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm_notebook

from data_loader import get_dataset_loader

In [164]:
class Config():        
    
    # Data 
    csv_file = 'data/complete_df.csv'
    vocab_file = 'crawling/Reviews_csv/vocab.txt'
    tag_vocab = 'crawling/tags_txt/tag_vocab.txt'
    rating_dict = {'불만':0, '추천안함':0,
                    '보통':1,
                    '추천':2, '만족':2,
                    '적극추천':3}
    category = 'subcat'    # 'subcat' or 'category'
        
    def add_dataset_info(self, dataset):
        """data loading이후에 결정되는 것들"""
        self.rating_size = len(dataset.rating2idx)
        self.category_size = len(dataset.category2idx)
        self.tag_size = len(dataset.tag2idx)
        self.output_size = len(dataset.word2idx)
        self.padding_idx = dataset.word2idx['PAD']  # 0
        self.SOS_token = dataset.word2idx['SOS']    # 1
        self.EOS_token = dataset.word2idx['EOS']    # 2
    # Encoder
    # pretrained = False 
    attribute_size = 64
    
    # Decoder
    hidden_size = 512 
    num_layers = 2
    num_attr = 3 # for attention!
    
    # training
    batch_size = 2
    dropout = 0.2
    num_steps = 100
    print_every = 1

In [165]:
class Encoder(nn.Module):
    def __init__(self, config):        
        super().__init__()
        self.config = config
        
        self.emb_rating = nn.Embedding(self.config.rating_size, self.config.attribute_size)   
        self.emb_category = nn.Embedding(self.config.category_size, self.config.attribute_size)
        self.emb_tag = nn.Embedding(self.config.tag_size, self.config.attribute_size,
                                   padding_idx=self.config.padding_idx)        
        self.out = nn.Linear(self.config.attribute_size * 3, self.config.hidden_size*self.config.num_layers)
        self.init_hidden()
        
    def forward(self, rating, category, tag):
        """
        Inputs:
            rating: TENSOR of shape (batch_size, 1)
            category: TENSOR of shape (batch_size, 1)
            tag : 1) TENSOR of shape (batch_size, tag_MAXLEN)
        Returns:
            concatenated attr for attention, encoder_output
        """
        
        assert len(rating) == len(category) == len(tag)
        attr_rating = self.emb_rating(rating)        # N x 1 x attr_size
        attr_category = self.emb_category(category)  # N x 1 x attr_size  
        tag_len = self.get_tag_len(tag)              
        attr_tag = torch.sum(self.emb_tag(tag), 1, keepdim=True) / tag_len    # CBOW
                                                     # N x max_tag_len x attr_size
                                                     # N x 1 x attr_size*3
        attr = torch.cat((attr_rating, attr_category, attr_tag), 2)
        out = self.out(attr)    # N x 1 x hidden_size * num_layers(decoder)
        attr = attr.view(self.config.batch_size, self.config.num_attr, -1)  # N x 3 x 64
        encoder_output = F.tanh(out)
        return attr, encoder_output
    
    def get_tag_len(self, tag): 
        """padding 제외한 token 개수"""
        return torch.sum(tag!=self.config.padding_idx, 1).unsqueeze(1).unsqueeze(1).type(torch.float)
        
    def init_hidden(self):
        for param in self.parameters():
            nn.init.uniform_(param, -0.08, 0.08)

In [166]:
class Decoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.embedding = nn.Embedding(self.config.output_size, self.config.hidden_size)
        self.lstm = nn.LSTM(self.config.hidden_size, self.config.hidden_size, \
                            num_layers=self.config.num_layers, dropout=self.config.dropout, \
                           batch_first=True)
        self.out = nn.Linear(self.config.hidden_size, self.config.output_size)
        
    def forward(self, input_token, hidden):
        """
        Inputs:
            input_token: TENSOR of shape (batch_size, 1)
            hidden: from last hidden of encoder (h_0, c_0) batch first
                        h_0 - num_layers * num_direction X batch X hidden_size
                        c_0 - num_layers * num_direction X batch X hidden_size
        Returns:
        """
        # 가운데 1이니까 unroll방식으로만!  - 바꿀 수 있나?!
        output = self.embedding(input_token)          # N x 1(seq_len) x hidden_size
        # LSTM의 hidden은 (hx, cx)
        output, hidden = self.lstm(output, hidden)    # N x 1(seq_len) x hidden_size * num_dir
                                                      # num_layers * num_direction x N x hidden_size
        output = self.out(output)    # N x 1(seq_len) x output_size
        output = F.log_softmax(output, dim=2)
        return output, hidden

    def initHidden(self):
        pass

In [167]:
class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.attn_score = nn.Linear(self.config.hidden_size + self.config.attribute_size, 1)
        
    def forward(self, last_hidden, attrs): 
        # last_hidden : torch.Size([num_layers*num_direction, seq_len, hidden_dim])
        # attrs : torch.Size([batch_size, num_attr, attr_size])
        attn_energies = torch.zeros((self.config.batch_size, 1, self.config.num_attr), requires_grad=True)
        # B x 1(seq_len인가?) x 3
        for i in range(self.config.num_attr):
            attn_energies[:,:,i] = self.score(last_hidden.squeeze(), attrs[:,i,:])
        return F.softmax(attn_energies, dim=-1)#.unsqueeze(0).unsqueeze(0) # 1,1,3
        
    def score(self, last_hidden, attr):  
        energy = self.attn_score(torch.cat((last_hidden, attr.squeeze()), -1))
        energy = F.tanh(energy)    # (batch, 1)
        return energy

In [168]:
class AttnDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.embedding = nn.Embedding(self.config.output_size, self.config.hidden_size)
        self.lstm = nn.LSTM(self.config.hidden_size, self.config.hidden_size, \
                            num_layers=self.config.num_layers, dropout=self.config.dropout, \
                           batch_first=True)
        self.attn_out = nn.Linear(self.config.hidden_size + self.config.attribute_size, 
                                  self.config.hidden_size)
        self.out = nn.Linear(self.config.hidden_size, self.config.output_size)

        self.attn = Attention(self.config)
            
    def forward(self, input_token, hidden, attrs):
        
        word_embedded = self.embedding(input_token)
        output, hidden = self.lstm(word_embedded, hidden)
        
        attn_weights = self.attn(output, attrs) 

        attrs = attrs.view(self.config.batch_size, self.config.num_attr, -1)
        context = attn_weights.bmm(attrs)
        
        output = F.tanh(self.attn_out(torch.cat((output, context), -1)))
        output = F.log_softmax(self.out(output), dim=-1)
        
        return output, hidden, attn_weights

In [169]:
# def splitHidden(encoder_output, config):
#                                                 # encoder_output.size(0) = batch_size
#     h_0 = encoder_output.view(config.num_layers, encoder_output.size(0), \
#                               config.hidden_size)
#     c_0 = torch.zeros_like(h_0) 
#     return (h_0, c_0)

# config = Config()
# dataset, dataloader = get_dataset_loader(config.csv_file, config.vocab_file, config.tag_vocab, config.rating_dict, \
#                         config.category, config.batch_size)
# config.add_dataset_info(dataset)

# dataiter = iter(dataloader)
# rating, category, tag, target = next(dataiter)

# encoder = Encoder(config)
# attndecoder = AttnDecoder(config)
# attrs, encoder_output = encoder(rating, category, tag)
# hidden = splitHidden(encoder_output, encoder.config)
# decoder_input = torch.zeros((config.batch_size,1)).long()
# output, context, hidden, attn_weights = attndecoder(decoder_input, hidden, attrs)

# print(output.size())
# print(context.size())
# print("*"*10)
# print(hidden[0].size())    # hidden 은 batch first랑 상관 없음!!
# print(hidden[1].size())
# print("*"*10)
# print(attn_weights.size())

In [170]:
def train(encoder, decoder, dataloader, loss_fn, optimizer, config, verbose=False):
    encoder.train()
    decoder.train()
    data_iter = iter(dataloader)
    
    def splitHidden(encoder_output):
        h_0 = encoder_output.view(config.num_layers, config.batch_size, \
                              config.hidden_size)
        c_0 = torch.zeros_like(h_0) 
        return (h_0, c_0)
    
    for t in tqdm_notebook(range(config.num_steps)):
        optimizer.zero_grad()
        rating_tensor, category_tensor, tag_tensor, target_tensor = next(data_iter)
        target_length = target_tensor.size(-1)
        
        attrs, encoder_output = encoder(rating_tensor, category_tensor, tag_tensor)
        decoder_hidden = splitHidden(encoder_output)        
        
        decoder_input = config.SOS_token * torch.ones((config.batch_size,1)).long() 
        decoder_outputs = []
        for idx in range(target_length): 
            decoder_output, decoder_hidden, attention_weights = \
                                decoder(decoder_input, decoder_hidden, attrs)            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.detach().view(config.batch_size, 1)
            decoder_outputs.append(decoder_output)

        # 이 아래 두 개 shape 조정은 loss function 요구 사항 맞추기 위함!
        decoder_outputs = torch.cat(decoder_outputs, 1).view(config.batch_size*target_length, -1)
        target_tensor = target_tensor.view(-1)
        loss = loss_fn(decoder_outputs, target_tensor) 
        num_actual_token = torch.sum(target_tensor != encoder.config.padding_idx).item()
        loss /= num_actual_token
        
        if verbose==True and t % config.print_every == 0:
            print("loss at %d step: %f" % (t, loss))
            
        loss.backward()
        optimizer.step()     

### Get data & set config

In [171]:
config = Config()
dataset, dataloader = get_dataset_loader(config.csv_file, config.vocab_file, config.tag_vocab, config.rating_dict, \
                        config.category, config.batch_size)
config.add_dataset_info(dataset)

### Instantiate model and start training

In [172]:
encoder = Encoder(config)
decoder = AttnDecoder(config)

params = list(encoder.parameters()) + list(decoder.parameters())
loss_fn = nn.NLLLoss(size_average=False, ignore_index=config.padding_idx)
optimizer = optim.Adam(params, lr=0.001)

train(encoder, decoder, dataloader, loss_fn, optimizer, config, verbose=True)

HBox(children=(IntProgress(value=0), HTML(value='')))

loss at 0 step: 10.094813



RuntimeError: leaf variable has been moved into the graph interior