In [1]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x29b3e5d5350>

In [92]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [10]:
import numpy as np
import math
from keras.utils import to_categorical
import json
np.random.seed(1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
with open("debugging/atis.json", "r") as f:
    data = json.load(f)

train_dev_sents = data["train_sents"] # list of lists
train_dev_labels = data["train_labels"] # list of lists
num_train = math.floor(0.8 * len(train_dev_sents))
train_sents = train_dev_sents[:num_train]
train_labels = train_dev_labels[:num_train]
dev_sents = train_dev_sents[num_train:]
dev_labels = train_dev_labels[num_train:]
test_sents = data["test_sents"]
test_labels = data["test_labels"]
word_to_id = data["vocab"]
label_to_id = data["label_dict"]

In [12]:
train_dev_sents[0]

[232,
 542,
 502,
 196,
 208,
 77,
 62,
 10,
 35,
 40,
 58,
 234,
 137,
 62,
 11,
 234,
 481,
 321]

In [178]:
UNK_TOKEN = "<UNK>"
PAD_TOKEN = "<PAD>"
VOCAB_SIZE = len(word_to_id)
NUM_LABELS = len(label_to_id)
EMBEDDING_SIZE = 50
HIDDEN_SIZE=50
MAX_LENGTH=100

In [179]:
print(word_to_id[UNK_TOKEN])
print(word_to_id[PAD_TOKEN])
print(label_to_id["O"])

7
572
126


In [15]:
id_to_word = {word:id_ for id_, word in word_to_id.items()}
id_to_label = {label:id_ for id_, label in label_to_id.items()}

for i in train_dev_sents[0]: print(id_to_word[i])

i
want
to
fly
from
boston
at
DIGITDIGITDIGIT
am
and
arrive
in
denver
at
DIGITDIGITDIGITDIGIT
in
the
morning


In [16]:
def prepare_sequence(seq): #, to_ix):
    idxs = seq #[to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [100]:
from tqdm import tqdm

EMBEDDING_DIM = 25
HIDDEN_DIM = 25

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_id), len(label_to_id))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [101]:

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
#with torch.no_grad():
#    inputs = prepare_sequence(training_data[0][0])#, word_to_ix)
#    tag_scores = model(inputs)
#    print(tag_scores)
n_total = len(train_dev_sents)
for epoch in range(25):  # again, normally you would NOT do 300 epochs, it is toy data
    accum_loss = 0
    for sentence, tags in tqdm(zip(train_dev_sents, train_dev_labels), total=n_total):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence)#, word_to_ix)
        targets = prepare_sequence(tags)#, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        
        accum_loss += loss
        loss.backward()
        optimizer.step()
    print(accum_loss / n_total)

# See what the scores are after training
#with torch.no_grad():
#    inputs = prepare_sequence(training_data[0][0])#, word_to_ix)
#    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
#    print(tag_scores)

100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 221.61it/s]


tensor(0.8641, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:23<00:00, 216.17it/s]


tensor(0.3848, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:23<00:00, 212.82it/s]


tensor(0.2673, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:30<00:00, 163.56it/s]


tensor(0.2068, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:26<00:00, 188.37it/s]


tensor(0.1692, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:28<00:00, 177.03it/s]


tensor(0.1437, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:29<00:00, 168.43it/s]


tensor(0.1253, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 195.81it/s]


tensor(0.1105, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:24<00:00, 191.96it/s]


tensor(0.0992, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 192.49it/s]


tensor(0.0893, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:24<00:00, 203.84it/s]


tensor(0.0816, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:23<00:00, 211.51it/s]


tensor(0.0752, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:27<00:00, 183.71it/s]


tensor(0.0695, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:27<00:00, 184.07it/s]


tensor(0.0646, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 197.45it/s]


tensor(0.0602, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:28<00:00, 172.35it/s]


tensor(0.0564, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 193.73it/s]


tensor(0.0530, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:29<00:00, 168.08it/s]


tensor(0.0500, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:28<00:00, 174.53it/s]


tensor(0.0477, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:25<00:00, 194.22it/s]


tensor(0.0457, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:23<00:00, 207.92it/s]


tensor(0.0444, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:23<00:00, 209.37it/s]


tensor(0.0408, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:34<00:00, 145.27it/s]


tensor(0.0389, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:33<00:00, 149.79it/s]


tensor(0.0365, grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████████████████████████| 4978/4978 [00:32<00:00, 157.24it/s]


tensor(0.0345, grad_fn=<DivBackward0>)


In [103]:
# After 5 epochs: tensor(0.1692, grad_fn=<DivBackward0>)
# After 25 epochs: tensor(0.0345, grad_fn=<DivBackward0>)


In [104]:
correct = 0
total = 0
for sentence, tags in zip(test_sents, test_labels):
    sentence_in = prepare_sequence(sentence)#, word_to_ix)
    targets = prepare_sequence(tags)#, tag_to_ix)
    tag_scores = model(sentence_in)
    correct += (tag_scores.argmax(-1) == targets).sum()
    total += len(targets)
    
print(float(correct) / float(total))
    

0.9395520765383779


In [None]:
# 0.9395520765383779

In [217]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)

In [218]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [219]:
def multi_meshgrid(*args):
    """
    Creates a meshgrid from possibly many
    elements (instead of only 2).
    Returns a nd tensor with as many dimensions
    as there are arguments
    """
    args = list(args)
    template = [1 for _ in args]
    for i in range(len(args)):
        n = args[i].shape[0]
        template_copy = template.copy()
        template_copy[i] = n
        args[i] = args[i].view(*template_copy)
        # there will be some broadcast magic going on
    return tuple(args)

def flip(tensor, dims):
    """
    This function should be in native PyTorch hopefully after 0.4
    :param tensor:
    :param dims:
    :return:
    """
    if not isinstance(dims, (tuple, list)):
        dims = [dims]
    indices = [torch.arange(tensor.shape[dim] - 1, -1, -1,
                                        dtype=torch.long, device="cuda") for dim in dims]
    multi_indices = multi_meshgrid(*indices)
    final_indices = [slice(i) for i in tensor.shape]
    for i, dim in enumerate(dims):
        final_indices[dim] = multi_indices[i]
    flipped = tensor[final_indices]
    return flipped

def stripe(a, left_top=True): # NON-BATCH version
    """
    Get a diagonal stripe of a matrix m x n, where n > m
    this implementation also takes into account batched matrices,
    so the stripe is calculated over a batch x for a matrix of size[x, m, n]
    left_top: whether stripe goes from left top to right bottom 
    (otherwise stripe from right top to left bottom is returned)
    """
    if not left_top:
        a = flip(a, -1)
    head, i, j = a.size()

    assert i <= j
    head_s, k, l = a.stride()
    # left top to right bottom
    new_shape = (head, i, i)
    steps = (head_s, k+l, l)
    result = torch.as_strided(a, new_shape, steps)
    if not left_top:
        result = flip(result, -1)
    return result

In [220]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout, n_head): # size == d_model
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.size = size
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.rel_pos_embeddings = nn.Embedding((MAX_LENGTH*2)-1, size//n_head)#, padding_idx=PAD)
        assert self.rel_pos_embeddings.weight.requires_grad # TODO: necessary to set?

    def forward(self, x):
        "Follow Figure 1 (left) for connections."
        length = x.size(-2)
        pos_indices = torch.tensor(range(MAX_LENGTH - length, MAX_LENGTH + length - 1), dtype=torch.long)
        #np.arange(MAX_LENGTH - length, MAX_LENGTH + length - 1)
        position = self.rel_pos_embeddings(pos_indices)
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, position))
        return self.sublayer[1](x, self.feed_forward)

attn_lists_0 = [[] for _ in range(10)]
attn_lists_1 = [[] for _ in range(10)]
attn_lists_2 = [[] for _ in range(10)]
attn_lists_3 = [[] for _ in range(10)]

def attention(query, key, value, rel_pos_vecs, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    # query/key/value.shape: head x position x hidden_size
    d_k = query.size(-1)
    # head x position x hidden_size *matmul* head x hidden_size x position => head x query_position x key_position
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        
    pos_query = query # TODO: make additional mapping for pos_query (as for query/key/value) ...
    # ... or for the position embeddding so that it depends on the head
    
    # rel_pos_vecs: position encodings for dot product attention
    # broadcastable/head x positions_times_2-1  x hidden_size
    assert rel_pos_vecs.size(-2) == query.size(-2) * 2 - 1 # TODO: was there a wrong size in the original implementation?
    assert rel_pos_vecs.size(-3) == 1 or rel_pos_vecs.size(-3) == query.size(0) # first dimension ...
    #... of relative position representation broadcastable (=same) for each head, or different for each head
    
    # head x position x hidden_size *matmul* broadcastable/head x hidden_size x relative_positions
    # => head x query_position x relative_positions
    # Here, relative positions are not yet shifted -> relative_positions = 2*key_positions-1
    rel_pos_scores = torch.matmul(pos_query, rel_pos_vecs.transpose(-2, -1)) / math.sqrt(d_k)

    # get (inverse-)diagonal stripe (from right top to left bottom)
    rel_pos_scores = stripe(rel_pos_scores, left_top=False)
    # Now relative positions are shifted w.r.t. the query.
    # Resulting shape: heads x query_positions x key_positions
    assert rel_pos_scores.size(-1) == rel_pos_scores.size(-2)
    assert rel_pos_scores.size(-1) == query.size(-2)
    
    position_distribution = F.softmax(rel_pos_scores, dim = -1)
    
    scores = scores + rel_pos_scores
    p_attn = F.softmax(scores, dim = -1) # for each query normalize over key positions
    
    if query.size(1)> 2:
        for i in range(min(10, query.size(1))):
            attn_lists_0[i].append(float(position_distribution[0,2,i])) # head 0, query position 2, i-th key position
            attn_lists_1[i].append(float(position_distribution[1,2,i])) # head 1, query position 2, i-th key position
            attn_lists_2[i].append(float(position_distribution[2,2,i])) # head 2, query position 2, i-th key position
            attn_lists_3[i].append(float(position_distribution[3,2,i])) # head 3, query position 2, i-th key position

    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [221]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, position):
        "Implements Figure 2"
        # TODO: what type is query/key/value? => whole sequence of input embeddings.
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(-1, self.h, self.d_k).transpose(0, 1)
             for l, x in zip(self.linears, (query, key, value))]
        
        # head x position x hidden_size
        position = position.unsqueeze(0)
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, position, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(0, 1).contiguous() \
             .view(-1, self.h * self.d_k)
        return self.linears[-1](x)

In [222]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=MAX_LENGTH):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        
        print(position)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        print(div_term)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        #pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:x.size(0)], 
                         requires_grad=False)
        return self.dropout(x)

In [223]:
class Tagger(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, src_embed, hidden_size, nb_tags):
        super(Tagger, self).__init__()
        self.encoder = encoder
        self.src_embed = src_embed
        self.hidden_to_tag = nn.Linear(hidden_size, nb_tags)
    
    def forward(self, src):
        "Take in and process masked src and target sequences."
        tag_space = self.hidden_to_tag(self.encode(src))
        tag_scores = F.log_softmax(tag_space, dim=1)# TODO: check dim (-1 better?)
        return tag_scores
    
    def encode(self, src):
        return self.encoder(self.src_embed(src))

In [224]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [225]:
def make_model(vocab_size, nb_tags, N=2, 
               d_model=40, d_ff=25, h=4, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = Tagger(Encoder(EncoderLayer(d_model, attn, ff, dropout, h), N),
        nn.Sequential(Embeddings(d_model, vocab_size), position),
        hidden_size=d_model, nb_tags=nb_tags)
    
    # This was important from their codsrc_vocabe. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

In [226]:
model = make_model(len(word_to_id), len(label_to_id))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

tensor([[ 0.],
        [ 1.],
        [ 2.],
        [ 3.],
        [ 4.],
        [ 5.],
        [ 6.],
        [ 7.],
        [ 8.],
        [ 9.],
        [10.],
        [11.],
        [12.],
        [13.],
        [14.],
        [15.],
        [16.],
        [17.],
        [18.],
        [19.],
        [20.],
        [21.],
        [22.],
        [23.],
        [24.],
        [25.],
        [26.],
        [27.],
        [28.],
        [29.],
        [30.],
        [31.],
        [32.],
        [33.],
        [34.],
        [35.],
        [36.],
        [37.],
        [38.],
        [39.],
        [40.],
        [41.],
        [42.],
        [43.],
        [44.],
        [45.],
        [46.],
        [47.],
        [48.],
        [49.],
        [50.],
        [51.],
        [52.],
        [53.],
        [54.],
        [55.],
        [56.],
        [57.],
        [58.],
        [59.],
        [60.],
        [61.],
        [62.],
        [63.],
        [64.],
        [65.],
        [6

  from ipykernel import kernelapp as app


In [227]:
from tqdm import tqdm_notebook as tqdm


n_total = len(train_dev_sents)
for epoch in range(25):  # again, normally you would NOT do 300 epochs, it is toy data
    accum_loss = 0
    for sentence, tags in tqdm(zip(train_dev_sents, train_dev_labels), total=n_total):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence)#, word_to_ix)
        targets = prepare_sequence(tags)#, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        accum_loss += loss
        loss.backward()
        optimizer.step()
    print(accum_loss / n_total)


A Jupyter Widget

tensor(0.3975, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.1232, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0822, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0614, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0525, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0463, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0389, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0356, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0301, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0268, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0281, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0271, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0215, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0198, grad_fn=<DivBackward0>)


A Jupyter Widget

tensor(0.0189, grad_fn=<DivBackward0>)


A Jupyter Widget

KeyboardInterrupt: 

In [None]:
# After 5 epochs: tensor(0.1251, grad_fn=<DivBackward0>)
# After 25 epochs: tensor(0.0679, grad_fn=<DivBackward0>)

In [228]:
for l in attn_lists_0:
    print(sum(l)/len(l), len(l), sum(l) )
print()
for l in attn_lists_1:
    print(sum(l)/len(l), len(l), sum(l) )
print()
for l in attn_lists_2:
    print(sum(l)/len(l), len(l), sum(l) )
print()
for l in attn_lists_3:
    print(sum(l)/len(l), len(l), sum(l) )
    
print()
for l in zip(attn_lists_0, attn_lists_1, attn_lists_2, attn_lists_3):
    s = (sum(l[0]) + sum(l[1]) + sum(l[2]) + sum(l[3]))/4
    assert len(l[0]) == len(l[1])
    print(s/len(l[1]), len(l[1]), s )

0.09721837500101903 154078 14979.21278340701
0.1144624073281614 154078 17636.138796308453
0.09581338038828995 154078 14762.73402346694
0.11515539229853884 151872 17488.87973916369
0.1059915970283634 149130 15806.526864839834
0.0900451749685293 144498 13011.347692602547
0.09161436097718885 138706 12707.461553701956
0.08242612621788327 128454 10587.965617191978
0.07910792859256256 114408 9050.579894417897
0.07097024992136645 99054 7029.887135711033

0.08764652787088788 154078 13504.401721290662
0.13233445865175836 154078 20389.828720145626
0.09547461510009381 154078 14710.537745392254
0.11966644457067346 151872 18173.98226983732
0.09750429591727496 149130 14540.815650143213
0.09205421277205672 144498 13301.649637136652
0.08291040433140828 138706 11500.170543192316
0.08322518384568715 128454 10690.607765713898
0.07520340516405925 114408 8603.871178009691
0.0739175047087922 99054 7321.824511424702

0.09460808185294364 154078 14577.02403573785
0.14654967788508277 154078 22580.081269177783
0

In [116]:
correct = 0
total = 0
for sentence, tags in zip(test_sents, test_labels):
    sentence_in = prepare_sequence(sentence)#, word_to_ix)
    targets = prepare_sequence(tags)#, tag_to_ix)
    tag_scores = model(sentence_in)
    correct += (tag_scores.argmax(-1) == targets).sum()
    total += len(targets)
    
print(float(correct) / float(total))
    

0.8885627310284845


In [121]:
class BasicTagger(nn.Module):
    def __init__(self, embedding_dim, vocab_size, tagset_size):
        super(BasicTagger, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The linear layer that maps from hidden state space to tag space
        self.embedding2tag = nn.Linear(embedding_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        tag_space = self.embedding2tag(embeds)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

basic_model = BasicTagger(EMBEDDING_DIM, len(word_to_id), len(label_to_id))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(basic_model.parameters(), lr=0.1)

n_total = len(train_dev_sents)
for epoch in range(25):  # again, normally you would NOT do 300 epochs, it is toy data
    accum_loss = 0
    for sentence, tags in tqdm(zip(train_dev_sents, train_dev_labels), total=n_total):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        basic_model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence)#, word_to_ix)
        targets = prepare_sequence(tags)#, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = basic_model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        
        accum_loss += loss
        loss.backward()
        optimizer.step()
    print(accum_loss / n_total)


A Jupyter Widget


tensor(0.9193, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.5314, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.4500, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.4125, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3901, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3748, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3636, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3550, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3482, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3427, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3382, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3344, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3311, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3284, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3260, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3239, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3220, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3204, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3190, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3178, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3166, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3156, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3147, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3139, grad_fn=<DivBackward0>)


A Jupyter Widget


tensor(0.3132, grad_fn=<DivBackward0>)


In [122]:
correct = 0
total = 0
for sentence, tags in zip(test_sents, test_labels):
    sentence_in = prepare_sequence(sentence)#, word_to_ix)
    targets = prepare_sequence(tags)#, tag_to_ix)
    tag_scores = basic_model(sentence_in)
    correct += (tag_scores.argmax(-1) == targets).sum()
    total += len(targets)
    
print(float(correct) / float(total))

0.8413785605566427
