In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data

import numpy as np

In [2]:
USE_CUDA = True

In [3]:
root = '/notebooks/sinica/dataset/'
train_data = root+'facial.train'
dev_data = root+'facial.dev'
test_data = root+'facial.test'

START_TAG = "<START>"
STOP_TAG = "<STOP>"
PAD_TAG = "<PAD>"
tag_to_ix = {START_TAG: 0, STOP_TAG: 1, PAD_TAG:2, "B-Func": 3, "I-Func": 4, "O": 5}

tagset_size = len(tag_to_ix)
MAX_LEN = 100
BATCH_SIZE = 128

EMBEDDING_DIM = 20
HIDDEN_DIM1 = 10
HIDDEN_DIM2 = 8
LABEL_EMBED_DIM = 3

In [4]:
def readfile(data):
    with open(data, "r", encoding="utf-8") as f:
        content = f.read().splitlines()
        
    return content

def get_word_and_label(_content, start_w, end_w):
    word_list = []
    tag_list = []
    for word_set in _content[start_w:end_w]:
        word_list.append(word_set[0])
        tag_list.append(word_set[2:])
    
    return word_list, tag_list

def split_to_list(content):
    init = 0
    word_list = []
    tag_list = []

    for i, c in enumerate(content):
        if c=='':
            words, tags = get_word_and_label(content, init, i)
            init = i+1
            word_list.append(words)
            tag_list.append(tags)
            
    return word_list, tag_list
    
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_all(seqs, to_ix):
    seq_list = []
    for i in range(len(seqs)):
        seq_list.append(prepare_sequence(seqs[i], to_ix))
        
    seq_list = torch.stack(seq_list)
        
    return seq_list

def word2index(word_list):
    word_to_ix = {"<START>":0, "<STOP>":1, "<PAD>":2}
    for sentence in word_list:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                
    return word_to_ix

def find_max_len(word_list):
    max_len = 0
    for i in range(len(word_list)):
        if max_len<len(word_list[i]):
            max_len=len(word_list[i])
            
    return max_len

def filter_len(word_list):
    reserved_index = []
    for i in range(len(word_list)):
        if len(word_list[i])<MAX_LEN:
            reserved_index.append(i)
            
    return reserved_index

def filter_sentence(reserved_index, word_list, tag_list):
    filter_word = list(word_list[i] for i in reserved_index)
    filter_tag = list(tag_list[i] for i in reserved_index)
    return filter_word, filter_tag

def pad_seq(seq):
    seq += [PAD_TAG for i in range(MAX_LEN-len(seq))]
    return seq

def pad_all(filter_word, filter_tag):
    input_padded = [pad_seq(s) for s in filter_word]
    target_padded = [pad_seq(s) for s in filter_tag]
    
    return input_padded, target_padded

#======================================
def dataload(input_var, target_var):
    torch_dataset = Data.TensorDataset(input_var, target_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,              
    )
    
    return loader

In [5]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim1, hidden_dim2, \
                 label_embed_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.label_embed_dim = label_embed_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim1 // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        
        self.lstm = nn.LSTM(hidden_dim1+label_embed_dim, hidden_dim2, batch_first=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim2, self.tagset_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.label_embed = nn.Linear(self.tagset_size, self.label_embed_dim)
        
        self.hidden1 = self.init_hidden1()
        self.hidden2 = self.init_hidden2()
        self.to_label_embed = self.init_label_embed()
        
        
    def init_hidden1(self):       
        hidden = torch.randn(2*2, BATCH_SIZE, self.hidden_dim1 // 2)        
        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_hidden2(self):       
        hidden = torch.randn(1, BATCH_SIZE, self.hidden_dim2)        
        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_label_embed(self):
        hidden = torch.randn(BATCH_SIZE, MAX_LEN, self.label_embed_dim)
        return hidden.cuda()if USE_CUDA else hidden
        
    def forward(self, sentence):
        self.hidden1 = self.init_hidden1()
        self.hidden2 = self.init_hidden2()
        embeds = self.word_embeds(sentence)
#         print(embeds.size())
#         print()
        
        bilstm_out, self.hidden1 = self.bilstm(embeds, self.hidden1)
#         print(bilstm_out.size())
#         print(self.hidden1[0].size())
#         print()
        
        combine_lstm = torch.cat((bilstm_out, self.to_label_embed), 2)
        
        lstm_out, self.hidden2 = self.lstm(combine_lstm, self.hidden2)
#         print(lstm_out.size())
#         print(self.hidden2[0].size())
#         print()
        
        to_tags = self.hidden2tag(lstm_out)
#         print(to_tags.size())

        output = self.softmax(to_tags)
#         print(output.size())
#         print()
        
        self.to_label_embed = self.label_embed(output)
#         print(self.to_label_embed.size())
        
        return output

In [6]:
content = readfile(train_data)

In [7]:
word_list, tag_list = split_to_list(content)

In [8]:
word_to_ix = word2index(word_list)

In [9]:
prepare_sequence(word_list[0], word_to_ix)

tensor([  3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,   3,   4,   5,   6,  20,   3,  21,
         22,  23,  24,  25,  26,  27,  28,   7,  29])

In [10]:
max_len = find_max_len(word_list)

In [11]:
reserved_index = filter_len(word_list)

In [12]:
filter_word, filter_tag = filter_sentence(reserved_index, word_list, tag_list)

In [13]:
input_padded, target_padded = pad_all(filter_word, filter_tag)

In [14]:
input_var = prepare_all(input_padded, word_to_ix)
target_var = prepare_all(target_padded, tag_to_ix)

In [15]:
vocab_size = len(word_to_ix)
vocab_size

2516

In [16]:
loader = dataload(input_var, target_var)

In [17]:
model = BiLSTM(vocab_size, tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, \
              LABEL_EMBED_DIM).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.NLLLoss()

In [18]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [23]:
n_iters = 10
print_every = 512
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for epoch in range(10):  
    for step, (batch_x, batch_y) in enumerate(loader):
        model.zero_grad()
        output = model(batch_x.cuda())
        print(output.size())
        print(batch_y.size())
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        
        if step % print_every == 0:
            print('%.4f| epoch: %d| step: %d| %s' % (loss, epoch, step, timeSince(start)))

torch.Size([128, 100, 6])
torch.Size([128, 100])


ValueError: Expected target size (128, 6), got torch.Size([128, 100])