In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [4]:
# read in all the words
# A context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open('data/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])
# build the vocabulary of characters and mappings to/from integers
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0  # index 0 is reserved for the '.' marker; real characters start at 1
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)
# Fixed seed so the in-place shuffle gives the same word order on every fresh run.
import random
random.seed(42)
random.shuffle(words)


32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [5]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words, context_len=None, char_to_ix=None):
  """Turn words into (context, next-character) training pairs.

  Every word is terminated with '.', and for each character (including that
  terminator) one example is emitted: the `context_len` preceding character
  indices as input and the character's own index as target. Contexts are
  left-padded with index 0.

  Args:
    words: iterable of strings to encode.
    context_len: number of context characters per example (expected >= 1).
      Defaults to the notebook-level `block_size`.
    char_to_ix: mapping from character to integer index; it must contain '.'
      and every character that occurs in `words`. Defaults to the
      notebook-level `stoi`.

  Returns:
    (X, Y): int64 tensors of shape (num_examples, context_len) and
    (num_examples,).

  Raises:
    KeyError: if a character is missing from `char_to_ix`.

  Side effects:
    Prints the shapes of X and Y.
  """
  # Fall back to the notebook globals only when the caller gave no override,
  # so existing `build_dataset(words)` calls behave exactly as before.
  if context_len is None:
    context_len = block_size
  if char_to_ix is None:
    char_to_ix = stoi

  X, Y = [], []

  for w in words:
    context = [0] * context_len
    for ch in w + '.':
      ix = char_to_ix[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  # Pin dtype and shape so an empty word list still yields a (0, context_len)
  # int64 tensor rather than a 1-D float tensor.
  X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_len)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
# Shuffle a copy rather than `words` itself: an in-place shuffle makes this
# cell non-idempotent, because every re-run would permute the already-permuted
# list and silently change the train/dev/test split.
shuffled_words = list(words)
random.shuffle(shuffled_words)
n1 = int(0.8*len(shuffled_words))
n2 = int(0.9*len(shuffled_words))

Xtr,  Ytr  = build_dataset(shuffled_words[:n1])     # 80%
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(shuffled_words[n2:])     # 10%

torch.Size([182580, 8]) torch.Size([182580])
torch.Size([22767, 8]) torch.Size([22767])
torch.Size([22799, 8]) torch.Size([22799])


In [6]:
# Index 0 maps to '.', the marker used to pad contexts and terminate words.
print(itos[0])

.


In [7]:
# One training context: block_size character indices, left-padded with 0.
print(Xtr[3])

tensor([ 0,  0,  0,  0,  0,  5,  2, 18])


In [24]:
# One-hot encode the contexts and lay them out time-major:
#   (num_examples, block_size)
#     -> (num_examples, block_size, vocab_size)
#     -> (block_size, num_examples, vocab_size)
# The axis swap must be a transpose. A `.view(8, -1, 27)` only reinterprets
# the flat memory, so it would mix characters from different examples into
# the same time step.
# `torch.nn.functional` is spelled out because the name `F` is rebound to a
# tensor further down, which would break this cell on a re-run.
one_hot_Xtr = torch.nn.functional.one_hot(Xtr, num_classes=vocab_size)
one_hot_Xtr = one_hot_Xtr.transpose(0, 1)

torch.Size([182580, 8, 27])

In [28]:
# Model width and the number of examples drawn per mini-batch.
hidden_size = 30
batch_size = 32
# Draw random example indices (with replacement) along axis 1 and slice them out.
batch = torch.randint(0, one_hot_Xtr.size(1), (batch_size,))
Xb = one_hot_Xtr[:, batch]
# Name the three axes of the batch for the cells below.
time_steps, batch_size, input_size = Xb.size()

In [116]:
# Step-0 slice of the batch, cast to float so it can be matrix-multiplied.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = Xb[0].float()
hidden = 30

# Parameters
# NOTE(review): binding `F` here shadows the `torch.nn.functional as F` import
# for every cell executed after this one.
WLSTM1 = torch.randn((vocab_size, hidden_size))   # input  -> hidden projection
WLSTM2 = torch.randn((hidden_size, hidden_size))  # hidden -> hidden projection
# Four element-wise gate tensors, drawn in the order F, i1, i2, O.
F, i1, i2, O = (torch.randn((batch_size, hidden_size)) for _ in range(4))
bias1, bias2, bias3, bias4 = (torch.zeros(hidden_size) for _ in range(4))
output_matrix = torch.randn(hidden_size)

# Storage
# One zero-filled (time_steps, batch_size, hidden_size) buffer per intermediate
# quantity; each name gets its own independent tensor.
hidden1, hidden2, total = (
    torch.zeros((time_steps, batch_size, hidden_size)) for _ in range(3)
)
preact1, preact2, preact3, preact4 = (
    torch.zeros((time_steps, batch_size, hidden_size)) for _ in range(4)
)
act1, act2, act3, act4 = (
    torch.zeros((time_steps, batch_size, hidden_size)) for _ in range(4)
)
C, Ct, Hout = (
    torch.zeros((time_steps, batch_size, hidden_size)) for _ in range(3)
)

# Initial cell and hidden state.
c0, h0 = (torch.zeros((batch_size, hidden_size)) for _ in range(2))


In [89]:
# Hout was allocated as (time_steps, batch_size, hidden_size).
Hout.shape

torch.Size([8, 32, 30])

In [90]:
# NOTE(review): neither `H` nor `t` is assigned by any earlier cell shown in
# this notebook, so this line only works with leftover kernel state. `Hout`
# looks like the intended hidden-state buffer — confirm.
prevh = H[t-1]

In [91]:
# Step-0 slice of hidden1: (batch_size, hidden_size).
print(hidden1[0].shape)

torch.Size([32, 30])


In [92]:
# `input` is Xb[0] cast to float32: (batch_size, input_size).
print(input.shape)

torch.Size([32, 27])


In [99]:
# Slice of hidden2 at step `t`; relies on `t` still being bound in the kernel.
hidden2[t].shape

torch.Size([32, 30])

In [100]:
# Shape of the previous-step hidden state taken above.
prevh.shape

torch.Size([32, 30])

In [101]:
# WLSTM2 was created as (hidden_size, hidden_size).
WLSTM2.shape

torch.Size([30, 30])

In [110]:
# Repeat check of the step-0 input slice: (batch_size, input_size).
input.shape

torch.Size([32, 27])

In [127]:
# Step-2 slice of hidden2: (batch_size, hidden_size).
hidden2[2].shape

torch.Size([32, 30])

In [128]:
# Step-2 slice of hidden1: (batch_size, hidden_size).
hidden1[2].shape

torch.Size([32, 30])

In [129]:
# Slice of total at step `t`; relies on `t` still being bound in the kernel.
total[t].shape

torch.Size([32, 30])

In [135]:
# Step-0 slice of total: (batch_size, hidden_size).
total[0].shape

torch.Size([32, 30])

In [142]:
# Forward pass
# Unrolls the recurrence over the time axis of Xb, writing every intermediate
# into its preallocated (time_steps, batch_size, hidden_size) buffer.
for t in range(time_steps):
    # Recurrent state from the previous step; the zero tensors h0 / c0 seed t == 0.
    # Hidden states are written to `Hout` below, so that is the buffer to read back.
    prevh = Hout[t-1] if t > 0 else h0
    prevc = C[t-1] if t > 0 else c0

    # Feed the characters of *this* time step. The notebook-level `input`
    # tensor only holds step 0, so reusing it would show the network the same
    # character at every step.
    x_t = Xb[t].type(torch.float32) # (32, 27)

    hidden1[t] = x_t @ WLSTM1 # (32, 30)
    hidden2[t] = prevh @ WLSTM2 # (32, 30)
    total[t] = hidden1[t] + hidden2[t] # (32, 30)

    # Gate pre-activations: element-wise scaling of the shared projection.
    preact1[t] = total[t] * F + bias1 # (32, 30)
    preact2[t] = total[t] * i1 + bias2 # (32, 30)
    preact3[t] = total[t] * i2 + bias3 # (32, 30)
    preact4[t] = total[t] * O + bias4 # (32, 30)

    act1[t] = torch.sigmoid(preact1[t]) # (32, 30)
    act2[t] = torch.sigmoid(preact2[t]) # (32, 30)
    act3[t] = torch.sigmoid(preact3[t]) # (32, 30)
    act4[t] = torch.tanh(preact4[t]) # (32, 30)

    # Roles as used below (they do not follow the parameter names):
    #   act2 scales the previous cell state      -> forget-gate role
    #   act1 scales the tanh candidate act4      -> input-gate role
    #   act3 scales tanh(C[t]) to give the output -> output-gate role
    C[t] = act2[t] * prevc + act1[t] * act4[t] # (32, 30)
    Ct[t] = torch.tanh(C[t]) # (32, 30)
    Hout[t] = Ct[t] * act3[t] # (32, 30)

In [None]:
# Backward pass
# Gradient buffers, one per parameter and with the same shape as that
# parameter. All of them start at zero so the backward loop can accumulate
# into them.
dWLSTM1 = torch.zeros(vocab_size, hidden_size)
dWLSTM2 = torch.zeros(hidden_size, hidden_size)
dF = torch.zeros(batch_size, hidden_size)
di1 = torch.zeros(batch_size, hidden_size)
di2 = torch.zeros(batch_size, hidden_size)
dO = torch.zeros(batch_size, hidden_size)
dbias1 = torch.zeros(hidden_size)
dbias2 = torch.zeros(hidden_size)
dbias3 = torch.zeros(hidden_size)
dbias4 = torch.zeros(hidden_size)
# Was torch.randn (copied from the parameter initialisation): a gradient must
# not start from random noise.
doutput_matrix = torch.zeros(hidden_size)

for t in reversed(range(x)):
    
    