<a href="https://colab.research.google.com/github/keith-leung/cis667/blob/master/BeamLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch as tr

In [None]:
# Softmax written out by hand: exponentiate every entry, then divide by the
# total so the results sum to 1. The bare last expression is the cell output.
a = tr.tensor([-10, 10, 10])
a.exp() / a.exp().sum()

tensor([1.0306e-09, 5.0000e-01, 5.0000e-01])

In [None]:
# Toy training corpus: one sentence per entry, words separated by single spaces.
# Note that "Who am I" is listed twice, so it is visited twice whenever the
# list is iterated.
sentences = [
    "How are you",
    "Who are you",
    "Who are they",
    "Who are we",
    "Who am I",
    "Who am I",
    "Where are you going",
]

In [None]:
# Make a dictionary mapping each word to a one-hot tensor.
#
# The vocabulary is sorted before it is frozen into a tuple: iteration order
# of a set of strings depends on Python's per-process hash randomisation, so
# tuple(set) alone can hand out different word -> index assignments after
# every kernel restart. Sorting makes the mapping reproducible.
words = tuple(sorted({
    word
    for sentence in sentences
    for word in sentence.split(" ")
}))

# PyTorch LSTM expects 3d tensors representing (sequence length, batch size, number of features)
I = tr.eye(len(words))  # row w of the identity matrix is the one-hot vector for words[w]
dictionary = {
    word: I[w].reshape(1, 1, len(words))
    for w, word in enumerate(words)
}

print(dictionary)

{'I': tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]), 'am': tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]]), 'Where': tensor([[[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]]), 'How': tensor([[[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]]]), 'they': tensor([[[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]]]), 'going': tensor([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]]), 'Who': tensor([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]]), 'we': tensor([[[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]]), 'are': tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]]), 'you': tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]]])}


In [None]:
# Define a small LSTM recurrent neural network with linear hidden-to-output layer
class Net(tr.nn.Module):
  """LSTM followed by a linear readout and a softmax over the vocabulary.

  Args:
    hidden_size: number of features in the LSTM hidden state.
    vocab_size: size of the one-hot input vectors and of the output
      distribution. When omitted, the length of the notebook-level ``words``
      tuple is used, so ``Net(hidden_size)`` keeps working as before.
  """
  def __init__(self, hidden_size, vocab_size=None):
    super().__init__()
    if vocab_size is None:
      vocab_size = len(words) # fall back to the notebook's vocabulary
    self.lstm = tr.nn.LSTM(input_size=vocab_size, hidden_size=hidden_size)
    self.readout = tr.nn.Linear(in_features=hidden_size, out_features=vocab_size)
  def forward(self, x, v=None):
    """Advance the LSTM over ``x`` and return next-word probabilities.

    Args:
      x: input tensor laid out as (sequence length, batch size, vocab_size).
      v: ``(h, c)`` state tuple returned by a previous call, or ``None`` to
        start from the LSTM's default initial state.

    Returns:
      ``(y, v)`` where ``y`` is the softmax of the readout applied to the
      hidden state after the last time-step of ``x``, and ``v`` is the updated
      ``(h, c)`` tuple to pass to the next call.
    """
    # nn.LSTM treats hx=None the same as omitting it, so no branching is needed
    _, v = self.lstm(x, v) # update hidden from input
    h = v[0] # hidden vector; v[1] is the internal so-called "cell state"
    y = self.readout(h) # get output from hidden
    y = tr.softmax(y, dim=-1) # make sure output is a probability distribution
    return y, v

# Show the layer layout of a throwaway network with a 3-unit hidden state
print(Net(hidden_size=3))

Net(
  (lstm): LSTM(10, 3)
  (readout): Linear(in_features=3, out_features=10, bias=True)
)


In [None]:
# Train the network to predict the next word of every sentence.
tr.manual_seed(0) # fixed seed so a fresh Restart & Run All reproduces the same run
net = Net(3)
opt = tr.optim.SGD(net.parameters(), lr=0.01)

for epoch in range(2000):

  opt.zero_grad() # start every epoch with clean gradients
  batch_loss = 0. # summed over every (word, next word) pair in the corpus

  for sentence in sentences:
    tokens = sentence.split(" ")

    v = None # no hidden activation at first time-step
    for t in range(len(tokens)-1):

      y, v = net(dictionary[tokens[t]], v)
      y_target = dictionary[tokens[t+1]]

      # Cross-entropy against a one-hot target. The target probability is
      # picked out *before* taking the log: the earlier form
      # -sum(y_target * log(y)) evaluates 0 * log(0) = nan as soon as any
      # non-target probability underflows to exactly 0. Whenever that form is
      # finite, this one yields the same value and gradient.
      # (Alternative loss that was tried here: tr.sum((y - y_target)**2), i.e. MSE.)
      loss = -tr.log(tr.sum(y_target * y))
      batch_loss += loss

  # One full-batch gradient step per epoch
  batch_loss.backward()
  opt.step()

  if epoch % 100 == 0: print(epoch, batch_loss.item())

0 36.9460334777832
100 27.079044342041016
200 24.730289459228516
300 20.0532283782959
400 15.06004524230957
500 12.627098083496094
600 11.252155303955078
700 10.191694259643555
800 9.365312576293945
900 8.77193832397461
1000 8.351117134094238
1100 8.044500350952148
1200 7.815389633178711
1300 7.6418280601501465
1400 7.508695125579834
1500 7.40455961227417
1600 7.321336269378662
1700 7.253550052642822
1800 7.197437286376953
1900 7.150343418121338


In [None]:
# Try predicting: greedily extend a seed word, feeding each predicted word
# back in as the next input.
word = "Who"
v = None # no hidden state before the first word
print(word)

with tr.no_grad(): # inference only; no autograd graph is needed
  for t in range(3):
    x = dictionary[word]
    # Feed the current word. The previous code fed dictionary[tokens[t]],
    # i.e. words of whatever sentence the training loop processed last,
    # so the seed word and the predictions never influenced the output.
    y, v = net(x, v)
    y = y.squeeze() # ignore singleton dimensions for time-step/example
    w = y.argmax().item() # index of the most probable next word, as a Python int
    word = words[w]
    prob = y[w]
    print(word, prob.item())



Who
are 0.935569703578949
you 0.9646154046058655
going 0.9231652617454529


In [None]:
# squeeze() drops every singleton dimension: (1, 1, 10) collapses to (10,)
a = tr.arange(10).reshape(1, 1, 10)
a = a.squeeze()
print(a)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
