In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
from tqdm import tqdm
import numpy as np

In [2]:
USE_CUDA = True

In [3]:
root = '/notebooks/sinica/dataset/'
train_data = root+'facial.train'
dev_data = root+'facial.dev'
test_data = root+'facial.test'

START_TAG = "<START>"
STOP_TAG = "<STOP>"
PAD_TAG = "<PAD>"
tag_to_ix = {START_TAG: 0, STOP_TAG: 1, PAD_TAG:2, "B-Func": 3, "I-Func": 4, "O": 5}

tagset_size = len(tag_to_ix)
MAX_LEN = 100
BATCH_SIZE = 128

EMBEDDING_DIM = 20
HIDDEN_DIM1 = 10
HIDDEN_DIM2 = 8
LABEL_EMBED_DIM = 3
DENSE_OUT = 100

In [4]:
def readfile(data):
    with open(data, "r", encoding="utf-8") as f:
        content = f.read().splitlines()
        
    return content

def get_word_and_label(_content, start_w, end_w):
    word_list = []
    tag_list = []
    for word_set in _content[start_w:end_w]:
        word_list.append(word_set[0])
        tag_list.append(word_set[2:])
    
    return word_list, tag_list

def split_to_list(content):
    init = 0
    word_list = []
    tag_list = []

    for i, c in enumerate(content):
        if c=='':
            words, tags = get_word_and_label(content, init, i)
            init = i+1
            word_list.append(words)
            tag_list.append(tags)
            
    return word_list, tag_list
    
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_all(seqs, to_ix):
    seq_list = []
    for i in range(len(seqs)):
        seq_list.append(prepare_sequence(seqs[i], to_ix))
        
    seq_list = torch.stack(seq_list)
        
    return seq_list

def word2index(word_list):
    word_to_ix = {"<START>":0, "<STOP>":1, "<PAD>":2}
    for sentence in word_list:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                
    return word_to_ix

def find_max_len(word_list):
    max_len = 0
    for i in range(len(word_list)):
        if max_len<len(word_list[i]):
            max_len=len(word_list[i])
            
    return max_len

def filter_len(word_list):
    reserved_index = []
    for i in range(len(word_list)):
        if len(word_list[i])<MAX_LEN:
            reserved_index.append(i)
            
    return reserved_index

def filter_sentence(reserved_index, word_list, tag_list):
    filter_word = list(word_list[i] for i in reserved_index)
    filter_tag = list(tag_list[i] for i in reserved_index)
    return filter_word, filter_tag

def pad_seq(seq):
    seq += [PAD_TAG for i in range(MAX_LEN-len(seq))]
    return seq

def pad_all(filter_word, filter_tag):
    input_padded = [pad_seq(s) for s in filter_word]
    target_padded = [pad_seq(s) for s in filter_tag]
    
    return input_padded, target_padded

#======================================
def dataload(input_var, target_var):
    torch_dataset = Data.TensorDataset(input_var, target_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,       
        drop_last=True
    )
    
    return loader

In [5]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim1, hidden_dim2, \
                 label_embed_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.label_embed_dim = label_embed_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim1 // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        
        self.dense = nn.Linear(hidden_dim1, DENSE_OUT)
        
        self.lstm = nn.LSTM(DENSE_OUT+label_embed_dim, hidden_dim2, batch_first=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim2, self.tagset_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.label_embed = nn.Linear(self.tagset_size, self.label_embed_dim)
        
#         self.hidden1 = self.init_hidden1()
#         self.hidden2 = self.init_hidden2()
#         self.to_label_embed = self.init_label_embed()
        
        
    def init_hidden1(self):       
        hidden = torch.randn(2*2, BATCH_SIZE, self.hidden_dim1 // 2)   
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_hidden2(self):       
        hidden = torch.randn(1, BATCH_SIZE, self.hidden_dim2)        
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_label_embed(self):
        hidden = torch.randn(BATCH_SIZE, MAX_LEN, self.label_embed_dim)
        return hidden.cuda()if USE_CUDA else hidden
        
    def forward(self, sentence):
        self.hidden1 = self.init_hidden1()
        self.hidden2 = self.init_hidden2()
        self.to_label_embed = self.init_label_embed()
        
        embeds = self.word_embeds(sentence)
#         print(embeds.size())
#         print()
        
        bilstm_out, self.hidden1 = self.bilstm(embeds, self.hidden1)
#         print(bilstm_out.size())
#         print(self.hidden1[0].size())
#         print()

        dense_out = self.dense(bilstm_out)
        
#         combine_lstm = torch.cat((dense_out, to_label_embed), 2)
        combine_lstm = torch.cat((dense_out, self.to_label_embed), 2)
        
        lstm_out, self.hidden2 = self.lstm(combine_lstm, self.hidden2)
#         print(lstm_out.size())
#         print(self.hidden2[0].size())
#         print()
        
        to_tags = self.hidden2tag(lstm_out)
#         print(to_tags.size())

        output = self.softmax(to_tags)
#         print(output.size())
#         print()
        
        self.to_label_embed = self.label_embed(output)
#         print(self.to_label_embed.size())
        
        return output.view(BATCH_SIZE*MAX_LEN, self.tagset_size)
#         return output.view(BATCH_SIZE, self.tagset_size, MAX_LEN)
#         return output

In [6]:
content = readfile(train_data)

In [7]:
word_list, tag_list = split_to_list(content)

In [8]:
word_to_ix = word2index(word_list)

In [9]:
prepare_sequence(word_list[0], word_to_ix)

tensor([  3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,   3,   4,   5,   6,  20,   3,  21,
         22,  23,  24,  25,  26,  27,  28,   7,  29])

In [10]:
max_len = find_max_len(word_list)

In [11]:
reserved_index = filter_len(word_list)

In [12]:
filter_word, filter_tag = filter_sentence(reserved_index, word_list, tag_list)

In [13]:
input_padded, target_padded = pad_all(filter_word, filter_tag)

In [14]:
input_var = prepare_all(input_padded, word_to_ix)
target_var = prepare_all(target_padded, tag_to_ix)

In [15]:
len(input_var)

8917

In [16]:
vocab_size = len(word_to_ix)
vocab_size

2516

In [17]:
loader = dataload(input_var, target_var)

In [18]:
model = BiLSTM(vocab_size, tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, \
              LABEL_EMBED_DIM).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.NLLLoss()

In [19]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [20]:
n_iters = 10
print_every = 10
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for epoch in tqdm(range(50)):  
    for step, (batch_x, batch_y) in enumerate(loader):
        optimizer.zero_grad()
        output = model(batch_x.cuda())
#         print(output.size())
#         print(batch_y.size())
#         break
        batch_y = batch_y.view(BATCH_SIZE*MAX_LEN)
        loss = criterion(output, batch_y.cuda())
        loss.backward()
#         loss.backward(retain_graph=True)
        optimizer.step()
        
        #if step % print_every == 0:
        #    print('%.4f| epoch: %d| step: %d| %s' % (loss, epoch, step, timeSince(start)))
    print("epoch: %d | loss %.4f"%(epoch,loss))
#     break

  2%|▏         | 1/50 [00:02<02:10,  2.66s/it]

epoch: 0 | loss 0.0860


  4%|▍         | 2/50 [00:05<02:18,  2.90s/it]

epoch: 1 | loss 0.0699


  6%|▌         | 3/50 [00:08<02:15,  2.88s/it]

epoch: 2 | loss 0.0319


  8%|▊         | 4/50 [00:11<02:11,  2.86s/it]

epoch: 3 | loss 0.0282


 10%|█         | 5/50 [00:14<02:07,  2.83s/it]

epoch: 4 | loss 0.0265


 12%|█▏        | 6/50 [00:16<02:01,  2.77s/it]

epoch: 5 | loss 0.0217


 14%|█▍        | 7/50 [00:19<01:59,  2.77s/it]

epoch: 6 | loss 0.0190


 16%|█▌        | 8/50 [00:21<01:55,  2.75s/it]

epoch: 7 | loss 0.0200


 18%|█▊        | 9/50 [00:24<01:52,  2.74s/it]

epoch: 8 | loss 0.0169


 20%|██        | 10/50 [00:27<01:51,  2.78s/it]

epoch: 9 | loss 0.0170


 22%|██▏       | 11/50 [00:30<01:48,  2.78s/it]

epoch: 10 | loss 0.0174


 24%|██▍       | 12/50 [00:33<01:44,  2.75s/it]

epoch: 11 | loss 0.0168


 26%|██▌       | 13/50 [00:35<01:42,  2.77s/it]

epoch: 12 | loss 0.0186


 28%|██▊       | 14/50 [00:38<01:39,  2.76s/it]

epoch: 13 | loss 0.0177


 30%|███       | 15/50 [00:41<01:36,  2.77s/it]

epoch: 14 | loss 0.0162


 32%|███▏      | 16/50 [00:44<01:33,  2.76s/it]

epoch: 15 | loss 0.0165


 34%|███▍      | 17/50 [00:46<01:30,  2.76s/it]

epoch: 16 | loss 0.0166


 36%|███▌      | 18/50 [00:49<01:27,  2.72s/it]

epoch: 17 | loss 0.0150


 38%|███▊      | 19/50 [00:51<01:23,  2.69s/it]

epoch: 18 | loss 0.0152


 40%|████      | 20/50 [00:53<01:19,  2.66s/it]

epoch: 19 | loss 0.0150


 42%|████▏     | 21/50 [00:55<01:17,  2.66s/it]

epoch: 20 | loss 0.0141


 44%|████▍     | 22/50 [00:58<01:14,  2.66s/it]

epoch: 21 | loss 0.0153


 46%|████▌     | 23/50 [01:00<01:10,  2.62s/it]

epoch: 22 | loss 0.0165


 48%|████▊     | 24/50 [01:02<01:07,  2.59s/it]

epoch: 23 | loss 0.0139


 50%|█████     | 25/50 [01:05<01:05,  2.60s/it]

epoch: 24 | loss 0.0158


 52%|█████▏    | 26/50 [01:07<01:02,  2.59s/it]

epoch: 25 | loss 0.0148


 54%|█████▍    | 27/50 [01:10<00:59,  2.60s/it]

epoch: 26 | loss 0.0135


 56%|█████▌    | 28/50 [01:13<00:57,  2.61s/it]

epoch: 27 | loss 0.0134


 58%|█████▊    | 29/50 [01:16<00:55,  2.63s/it]

epoch: 28 | loss 0.0159


 60%|██████    | 30/50 [01:18<00:52,  2.63s/it]

epoch: 29 | loss 0.0147


 62%|██████▏   | 31/50 [01:21<00:50,  2.64s/it]

epoch: 30 | loss 0.0137


 64%|██████▍   | 32/50 [01:24<00:47,  2.65s/it]

epoch: 31 | loss 0.0157


 66%|██████▌   | 33/50 [01:27<00:45,  2.66s/it]

epoch: 32 | loss 0.0157


 68%|██████▊   | 34/50 [01:29<00:42,  2.65s/it]

epoch: 33 | loss 0.0153


 70%|███████   | 35/50 [01:33<00:39,  2.66s/it]

epoch: 34 | loss 0.0144


 72%|███████▏  | 36/50 [01:35<00:37,  2.66s/it]

epoch: 35 | loss 0.0167


 74%|███████▍  | 37/50 [01:38<00:34,  2.67s/it]

epoch: 36 | loss 0.0153


 76%|███████▌  | 38/50 [01:41<00:32,  2.68s/it]

epoch: 37 | loss 0.0165


 78%|███████▊  | 39/50 [01:45<00:29,  2.69s/it]

epoch: 38 | loss 0.0151


 80%|████████  | 40/50 [01:48<00:27,  2.70s/it]

epoch: 39 | loss 0.0138


 82%|████████▏ | 41/50 [01:51<00:24,  2.71s/it]

epoch: 40 | loss 0.0141


 84%|████████▍ | 42/50 [01:54<00:21,  2.72s/it]

epoch: 41 | loss 0.0141


 86%|████████▌ | 43/50 [01:57<00:19,  2.72s/it]

epoch: 42 | loss 0.0151


 88%|████████▊ | 44/50 [02:00<00:16,  2.73s/it]

epoch: 43 | loss 0.0146


 90%|█████████ | 45/50 [02:03<00:13,  2.74s/it]

epoch: 44 | loss 0.0131


 92%|█████████▏| 46/50 [02:06<00:11,  2.75s/it]

epoch: 45 | loss 0.0146


 94%|█████████▍| 47/50 [02:09<00:08,  2.76s/it]

epoch: 46 | loss 0.0140


 96%|█████████▌| 48/50 [02:12<00:05,  2.76s/it]

epoch: 47 | loss 0.0140


 98%|█████████▊| 49/50 [02:15<00:02,  2.77s/it]

epoch: 48 | loss 0.0139


100%|██████████| 50/50 [02:18<00:00,  2.78s/it]

epoch: 49 | loss 0.0150





In [21]:
 
for step, (batch_x, batch_y) in enumerate(loader):
    optimizer.zero_grad()
    output = model(batch_x.cuda())
    print(output.size())
    print(batch_y.size())
    break

torch.Size([12800, 6])
torch.Size([128, 100])
