# **Sequence Models and LSTMs**


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1) 


<torch._C.Generator at 0x7fafc5165c70>

In [47]:
lstm = nn.LSTM(3,3) #input dim =3 , op dim = 3
inputs = [torch.randn(1,3) for _ in range(5)]  #Making a sequence length of 5

# initialize the hidden state (2 states).
hidden = (torch.randn(1, 1, 3),  #h_0
          torch.randn(1, 1, 3))  #c_0

#above line is same as 
# h0 = torch.randn(1, 1, 3)
# c0 = torch.randn(1, 1, 3)

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    output, hidden = lstm(i.view(1,1,-1), hidden)

print(output)
print(hidden)


tensor([[[-0.0628, -0.0462, -0.1530]]], grad_fn=<StackBackward>)
(tensor([[[-0.0628, -0.0462, -0.1530]]], grad_fn=<StackBackward>), tensor([[[-0.4270, -0.0542, -0.4110]]], grad_fn=<StackBackward>))


In [42]:
torch.cat(inputs).view(len(inputs),1,-1)


tensor([[[-0.5525,  0.6355, -0.3968]],

        [[-0.6571, -1.6428,  0.9803]],

        [[-0.0421, -0.8206,  0.3133]],

        [[-1.1352,  0.3773, -0.2824]],

        [[-2.5667, -1.4303,  0.5009]]])

https://stackoverflow.com/questions/48302810/whats-the-difference-between-hidden-and-output-in-pytorch-lstm/48305882#48305882

# **LSTM for Part-of-Speech Tagging**

In [41]:
#Prepare Data

def prepare_sequence(seq,to_ix):
  idxs = [to_ix[w] for w in seq]
  return torch.tensor(idxs, dtype = torch.long)

training_data = [
    ("The dog ate the apple".split(), ["DET","NN","V","DET","NN"]),
    ("Everybody read that book.".split(), ["NN","V", "DET", "NN"])
]

word_to_ix = {}
for sent, tags in training_data:
  for word in sent:
    if word not in word_to_ix:
      word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

tag_to_ix = {"DET" : 0, "NN" : 1, "V" : 2}
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book.': 8}


In [0]:
#Creating the model

class LSTMTagger(nn.Module):
  def __init__(self,vocab_size, hidden_dim, embed_dim, tagset_size):
    super(LSTMTagger,self).__init__()
    self.hidden_dim = hidden_dim

    self.word_embeddings = nn.Embedding(vocab_size,embed_dim)
    
    # The LSTM takes word embeddings as inputs, and outputs hidden states with dimensionality hidden_dim.
    self.lstm = nn.LSTM(embed_dim, hidden_dim)
    
    # The linear layer that maps from hidden state space to tag space
    self.hidden2tag = nn.Linear(hidden_dim,tagset_size)
    
  def forward(self,sentence):
    embeds = self.word_embeddings(sentence)
    lstm_out, _ = self.lstm(embeds.view(len(sentence),1,-1))
    tag_space = self.hidden2tag(lstm_out.view(len(sentence),-1))
    tag_scores = F.log_softmax(tag_space,dim=1)
    return tag_scores

In [46]:
model = LSTMTagger(len(word_to_ix), HIDDEN_DIM, EMBEDDING_DIM, len(tag_to_ix))
lossfn = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(),lr=0.1)

# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    
for epoch in range(300):
  for sentence, tag in training_data:
    
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
    model.zero_grad()

    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    sentence_in = prepare_sequence(sentence, word_to_ix)
    targets = prepare_sequence(tag, tag_to_ix)
     
    # Step 3 : Run forward pass
    tag_preds = model(sentence_in)
    
    # Step 4. Compute the loss, gradients, and update the parameters by
    #  calling optimizer.step()
    loss = lossfn(tag_preds,targets)
    loss.backward()
    optimizer.step()
    
# See what the scores are after training
with torch.no_grad():
  inputs = prepare_sequence(training_data[1][0], word_to_ix)
  score = model(inputs)
  print(score)

    

tensor([[-5.3507, -0.0308, -3.6660],
        [-2.9660, -3.4961, -0.0854],
        [-0.0702, -5.2708, -2.7696],
        [-5.3992, -0.0091, -5.3926]])
