## ===========LSTM for POS==============

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x271d04f0990>

In [3]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [4]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

In [18]:
word_to_ix = {}

for st, tar in training_data:
    for wd in st:
        if wd not in word_to_ix:
            word_to_ix[wd] = len(word_to_ix)

print(word_to_ix)

tag_to_ix = {'DET': 0, 'NN': 1, 'V': 2}

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


#### model

In [6]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [14]:
class LSTMTagger(nn.Module):
    def __init__(self, hidden_size, embedding_size, vocab_size, target_size):
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.hidden2tag = nn.Linear(hidden_size, target_size)
        self.hidden = self.initHidden()
        
    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_dim),
                   torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, sentence):
        embed = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embed.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_score = F.log_softmax(tag_space, dim=1)
        
        return tag_score
        

In [15]:
model = LSTMTagger(HIDDEN_DIM, EMBEDDING_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

#### before training

In [16]:
with torch.no_grad():
    input_ = prepare_sequence(training_data[0][0], word_to_ix)
    score = model(input_)
    print(score)

tensor([[-0.8214, -1.4441, -1.1264],
        [-0.8110, -1.4408, -1.1430],
        [-0.7254, -1.5179, -1.2150],
        [-0.7427, -1.5127, -1.1911],
        [-0.8500, -1.4378, -1.0932]])


#### training

In [19]:
for epoch in range(300):
    for sentence, tag in training_data:
        model.zero_grad()
        
        model.hidden = model.initHidden()
        
        input_ = prepare_sequence(sentence, word_to_ix)
        tag_score = model(input_)
        target_ = prepare_sequence(tag, tag_to_ix)
        
        loss = loss_function(tag_score, target_)
        loss.backward()
        optimizer.step()

#### after training

In [21]:
#("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"])

In [20]:
with torch.no_grad():
    input_ = prepare_sequence(training_data[0][0], word_to_ix)
    score = model(input_)
    print(score)

tensor([[-0.3051, -1.4199, -3.8538],
        [-4.4802, -0.0415, -3.5289],
        [-3.4153, -3.3745, -0.0695],
        [-0.0492, -3.4455, -4.1279],
        [-4.5835, -0.0167, -5.0532]])


## ADVANCED: MAKING DYNAMIC DECISIONS AND THE BI-LSTM CRF

In [95]:
import numpy as np

In [4]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1dce5065c50>

In [5]:
def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()

In [6]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [58]:
#log_sum_exp in a stable way
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
    

In [164]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, hidden_dim, vocab_size, embedding_dim, tag_to_ix):
        super(BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.tag_size = len(tag_to_ix)
        self.embedding_dim = embedding_dim
        self.tag_to_ix = tag_to_ix
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        
        self.hidden2tag = nn.Linear(hidden_dim, self.tag_size)
        
        #transition matrix
        self.transition = nn.Parameter(torch.randn(self.tag_size, self.tag_size))
        
        self.transition.data[tag_to_ix['START_TAG'], :] = -10000
        self.transition.data[:, tag_to_ix['END_TAG']] = -10000
        
        self.hidden = self.initHidden()
        
    def initHidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2), torch.randn(2, 1, self.hidden_dim // 2))
    
    def _forward_alg(self, feats):
        init_alphas = torch.full((1, self.tag_size), -10000)
        init_alphas[0][self.tag_to_ix['START_TAG']] = 0
        
        forward_var = init_alphas
        
        for feat in feats:
            forward_t = []
            
            for next_tag in range(self.tag_size):
                emit_score = feat[next_tag].view(1, -1)
                emit_socre_broadcast = emit_score.expand(1, self.tag_size)
                
                trans_score = self.transition[next_tag].view(1, -1)
                
                next_for_var = forward_var + trans_score + emit_socre_broadcast
                
                forward_t.append(log_sum_exp(next_for_var).view(1))
            
            forward_var = torch.cat(forward_t).view(1, -1)
        
        terminal_var = forward_var + self.transition[self.tag_to_ix['END_TAG']]      
        terminal_score = log_sum_exp(terminal_var)
            
        return terminal_score
        
    def _get_lstm_features(self, sentence):
        self.hidden = self.initHidden()
        word_embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        output, self.hidden = self.lstm(word_embeds, self.hidden)
        output = output.view(len(sentence), self.hidden_dim)
        feats = self.hidden2tag(output)
        
        return feats
    
    def _viterbi_decode(self, feats):
        back_pointer = []
        
        init_vvars = torch.full((1, self.tag_size), -10000)
        init_vvars[0][self.tag_to_ix['START_TAG']] = 0
        
        forward_var = init_vvars
        for feat in feats:
            cur_var = []
            cur_back = []
            for next_tag in range(self.tag_size):
                trans_score = self.transition[next_tag]
                next_tag_var = forward_var + trans_score
                best_tag_id = argmax(next_tag_var)
                cur_back.append(best_tag_id)
                cur_var.append(next_tag_var[0, best_tag_id].view(1))
            
            back_pointer.append(cur_back)
            forward_var = (torch.cat(cur_var) + feat).view(1, -1)
            
        terminal_var = forward_var + self.transition[self.tag_to_ix['END_TAG']]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0, best_tag_id].view(1)
        
        best_path = [best_tag_id]
        
        for cur_back in reversed(back_pointer):
            #print(best_tag_id)
            best_tag_id = cur_back[best_tag_id]
            best_path.append(best_tag_id)
                    
        start = best_path.pop()
        assert start == self.tag_to_ix['START_TAG']
        best_path.reverse()
        
        return path_score, best_path
        

    def _score_sentence(self, feats, tags):
        score = torch.zeros(1)     
        tags = torch.cat([torch.tensor([self.tag_to_ix['START_TAG']], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + self.transition[tags[i+1], tags[i]].view(1) + feat[tags[i+1]].view(1)
        
        score += self.transition[self.tag_to_ix['END_TAG'], tags[-1]].view(1)
        
        return score
    
    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score
    
    def neg_log_likelihood2(self, sentence, tags):
        lstm_feats = self._get_lstm_features(sentence)
        decode_score, decode_tags = self._viterbi_decode(lstm_feats)
        gold_score = self._score_sentence(lstm_feats, tags)

        same_len = sum([1 if i == j else 0 for i, j in zip(tags, decode_tags)])
        #if same_len == len(tags):
            #return torch.tensor([0], dtype=torch.long)
        loss_p1 = max((20 - gold_score), 0.1)
        loss_p2 = gold_score - (gold_score / len(tags) * same_len)
        loss_p3 = decode_score - (decode_score / len(tags) * same_len)
        loss = loss_p1 + loss_p2 + loss_p3
        return loss
    
    def forward(self, sentence):
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq
        

In [112]:
START_TAG = "START_TAG"
END_TAG = "END_TAG"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

In [107]:
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, "START_TAG": 3, "END_TAG": 4}

In [165]:
model = BiLSTM_CRF(HIDDEN_DIM, len(word_to_ix), EMBEDDING_DIM, tag_to_ix)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [89]:
with torch.no_grad():
    precheck_sentence = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tag_seq = [tag_to_ix[tag] for tag in training_data[0][1]]
    print(precheck_tag_seq)
    print(model(precheck_sentence))

[0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2]
(tensor([20.9057]), [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])


In [168]:
#use neg_log_likelihood(use the forward alg)

In [155]:
for epoch in range(100):
    
    for sentence, tags in training_data:
        model.zero_grad()
        model.initHidden()
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        loss = model.neg_log_likelihood(sentence_in, targets)
        print(loss)
        loss.backward()
        optimizer.step()
    
with torch.no_grad():
    for sentence, tags in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        
        print('tagets : ', targets)
        print('pre : ', model(sentence_in))
        print('================')

tensor([13.8975], grad_fn=<ThSubBackward>)
tensor([9.0473], grad_fn=<ThSubBackward>)
tensor([13.3586], grad_fn=<ThSubBackward>)
tensor([8.8260], grad_fn=<ThSubBackward>)
tensor([13.0316], grad_fn=<ThSubBackward>)
tensor([8.5113], grad_fn=<ThSubBackward>)
tensor([12.7835], grad_fn=<ThSubBackward>)
tensor([8.1712], grad_fn=<ThSubBackward>)
tensor([12.3300], grad_fn=<ThSubBackward>)
tensor([7.9499], grad_fn=<ThSubBackward>)
tensor([12.0882], grad_fn=<ThSubBackward>)
tensor([7.7376], grad_fn=<ThSubBackward>)
tensor([11.6955], grad_fn=<ThSubBackward>)
tensor([7.4244], grad_fn=<ThSubBackward>)
tensor([11.4362], grad_fn=<ThSubBackward>)
tensor([7.3769], grad_fn=<ThSubBackward>)
tensor([11.2459], grad_fn=<ThSubBackward>)
tensor([7.1496], grad_fn=<ThSubBackward>)
tensor([11.0312], grad_fn=<ThSubBackward>)
tensor([7.0471], grad_fn=<ThSubBackward>)
tensor([10.8381], grad_fn=<ThSubBackward>)
tensor([6.8703], grad_fn=<ThSubBackward>)
tensor([10.6708], grad_fn=<ThSubBackward>)
tensor([6.8352], grad_

tensor([5.7185], grad_fn=<ThSubBackward>)
tensor([3.2895], grad_fn=<ThSubBackward>)
tagets :  tensor([0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])
pre :  (tensor([18.8147]), [0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tagets :  tensor([0, 1, 2, 2, 2, 2, 0])
pre :  (tensor([11.4065]), [0, 1, 2, 2, 2, 2, 2])


In [167]:
#use neg_log_likelihood2 (cal the loss use the viterbe decode path)

In [166]:
for epoch in range(150):
    
    for sentence, tags in training_data:
        model.zero_grad()
        model.initHidden()
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        loss = model.neg_log_likelihood2(sentence_in, targets)
        print(loss)
        loss.backward()
        optimizer.step()
    
with torch.no_grad():
    for sentence, tags in training_data:
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        
        print('tagets : ', targets)
        print('pre : ', model(sentence_in))
        print('================')

tensor([27.3779], grad_fn=<ThAddBackward>)
tensor([18.9753], grad_fn=<ThAddBackward>)
tensor([27.1541], grad_fn=<ThAddBackward>)
tensor([18.9429], grad_fn=<ThAddBackward>)
tensor([26.5656], grad_fn=<ThAddBackward>)
tensor([19.0123], grad_fn=<ThAddBackward>)
tensor([26.2677], grad_fn=<ThAddBackward>)
tensor([19.0848], grad_fn=<ThAddBackward>)
tensor([25.0882], grad_fn=<ThAddBackward>)
tensor([17.6480], grad_fn=<ThAddBackward>)
tensor([24.8263], grad_fn=<ThAddBackward>)
tensor([17.7330], grad_fn=<ThAddBackward>)
tensor([24.6369], grad_fn=<ThAddBackward>)
tensor([17.5252], grad_fn=<ThAddBackward>)
tensor([24.5979], grad_fn=<ThAddBackward>)
tensor([17.5009], grad_fn=<ThAddBackward>)
tensor([24.2695], grad_fn=<ThAddBackward>)
tensor([17.3153], grad_fn=<ThAddBackward>)
tensor([24.2258], grad_fn=<ThAddBackward>)
tensor([17.3741], grad_fn=<ThAddBackward>)
tensor([24.0904], grad_fn=<ThAddBackward>)
tensor([17.1789], grad_fn=<ThAddBackward>)
tensor([23.8908], grad_fn=<ThAddBackward>)
tensor([17.

tensor([1.3206], grad_fn=<ThAddBackward>)
tensor([36.8437], grad_fn=<ThAddBackward>)
tensor([0.9479], grad_fn=<ThAddBackward>)
tensor([26.2802], grad_fn=<ThAddBackward>)
tensor([0.8124], grad_fn=<ThAddBackward>)
tensor([33.6020], grad_fn=<ThAddBackward>)
tensor([0.8329], grad_fn=<ThAddBackward>)
tensor([22.4055], grad_fn=<ThAddBackward>)
tensor([0.4276], grad_fn=<ThAddBackward>)
tensor([26.0532], grad_fn=<ThAddBackward>)
tensor([0.3752], grad_fn=<ThAddBackward>)
tensor([26.1521], grad_fn=<ThAddBackward>)
tensor([0.1000], grad_fn=<ThAddBackward>)
tensor([36.8111], grad_fn=<ThAddBackward>)
tensor([0.1000], grad_fn=<ThAddBackward>)
tensor([22.2614], grad_fn=<ThAddBackward>)
tensor([0.1000], grad_fn=<ThAddBackward>)
tensor([25.7760], grad_fn=<ThAddBackward>)
tensor([0.1208], grad_fn=<ThAddBackward>)
tensor([29.5267], grad_fn=<ThAddBackward>)
tensor([0.1000], grad_fn=<ThAddBackward>)
tensor([15.3543], grad_fn=<ThAddBackward>)
tensor([0.1000], grad_fn=<ThAddBackward>)
tensor([18.8115], grad_