In [240]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [184]:
# read in all the words
words = open('data/names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])
# build the vocabulary of characters and mappings to/from integers


chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(f'Character to index mapping: {itos}')
print(f'Vocabulary size: {vocab_size}')

# Shuffle the words
random.seed(42)
random.shuffle(words)


32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
Character to index mapping: {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
Vocabulary size: 27


In [245]:
# Function to encode words into a single continuous sequence of indices
def encode_words(words):
    encoded = []
    for w in words:
        encoded.extend([stoi[ch] for ch in '.' + w + '.'])
    return encoded

# Encode all words into a single sequence
encoded = encode_words(words)

# Function to create input-output pairs from a sequence
def create_pairs(seq, block_size):
    X, Y = [], []
    for i in range(0, len(seq) - block_size):
        X.append(seq[i:i+block_size])
        Y.append(seq[i+1:i+block_size+1])
    X = torch.tensor(X, dtype=torch.long)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# Split into training and development sets
n = len(encoded)
n1 = int(0.8 * n)

train_seq = encoded[:n1]
dev_seq = encoded[n1:]

block_size = 8

# Create pairs for training and development sets
Xtr, Ytr = create_pairs(train_seq, block_size)
Xdev, Ydev = create_pairs(dev_seq, block_size)

print(f'Training data shapes - X: {Xtr.shape}, Y: {Ytr.shape}')
print(f'Development data shapes - X: {Xdev.shape}, Y: {Ydev.shape}')

# Function to split data into batches of shape (batch_size, time_steps, -1)
def split_into_batches(X, Y, batch_size):
    # Ensure the data size is a multiple of batch_size
    num_batches = X.size(0) // batch_size
    X = X[:num_batches * batch_size]
    Y = Y[:num_batches * batch_size]
    
    # Reshape into batches
    X = X.view(batch_size, -1, X.size(-1))
    Y = Y.view(batch_size, -1, Y.size(-1))
    return X, Y

# Example output
print(f'Training batches shape - X: {Xtr.shape}, Y: {Ytr.shape}')
print(f'Development batches shape - X: {Xdev.shape}, Y: {Ydev.shape}')


Training data shapes - X: torch.Size([208135, 8]), Y: torch.Size([208135, 8])
Development data shapes - X: torch.Size([52028, 8]), Y: torch.Size([52028, 8])
Training batches shape - X: torch.Size([208135, 8]), Y: torch.Size([208135, 8])
Development batches shape - X: torch.Size([52028, 8]), Y: torch.Size([52028, 8])


In [246]:
print(Xtr[2])
print(Ytr[2])

tensor([21,  8,  5, 14,  7,  0,  0,  4])
tensor([ 8,  5, 14,  7,  0,  0,  4,  9])


In [247]:
one_hot_Xtr = F.one_hot(Xtr, 27)
one_hot_Xtr = one_hot.view(8, -1, 27)

In [248]:
Xb.shape

torch.Size([8, 32, 27])

In [249]:
Yb.shape

torch.Size([8, 32])

In [251]:
batch_size = 32
batch = torch.randint(0, one_hot_Xtr.shape[1], (batch_size,))
Xb = one_hot_Xtr[:, batch, :] # (8, 32, 27)
Yb = Ytr[batch]
Yb = Yb.view(8, -1)
time_steps, batch_size, input_size = Xb.shape
hidden_size = 30

torch.Size([8, 32, 27])

In [259]:
# for t in time_steps:

Xb = Xb.type(torch.float32)
Yb = Yb.type(torch.int64)

hidden = 30

# Parameters
WLSTM1 = torch.randn(vocab_size, hidden_size)
WLSTM2 = torch.randn(hidden_size, hidden_size)
F = torch.randn(batch_size, hidden_size)
i1 = torch.randn(batch_size, hidden_size)
i2 = torch.randn(batch_size, hidden_size)
O = torch.randn(batch_size, hidden_size)
bias1 = torch.zeros(hidden_size)
bias2 = torch.zeros(hidden_size)
bias3 = torch.zeros(hidden_size)
bias4 = torch.zeros(hidden_size)
output_matrix = torch.randn(hidden_size, vocab_size)

# Storage
# prev = torch.zeros(time_steps, batch_size, hidden_size)
hidden1 = torch.zeros(time_steps, batch_size, hidden_size)
hidden2 = torch.zeros(time_steps, batch_size, hidden_size)
total = torch.zeros(time_steps, batch_size, hidden_size)
preact1 = torch.zeros(time_steps, batch_size, hidden_size)
preact2 = torch.zeros(time_steps, batch_size, hidden_size)
preact3 = torch.zeros(time_steps, batch_size, hidden_size)
preact4 = torch.zeros(time_steps, batch_size, hidden_size)
act1 = torch.zeros(time_steps, batch_size, hidden_size)
act2 = torch.zeros(time_steps, batch_size, hidden_size)
act3 = torch.zeros(time_steps, batch_size, hidden_size)
act4 = torch.zeros((time_steps, batch_size, hidden_size))
C = torch.zeros((time_steps, batch_size, hidden_size))
Ct = torch.zeros((time_steps, batch_size, hidden_size))
Hout = torch.zeros((time_steps, batch_size, hidden_size))
output = torch.zeros((time_steps, batch_size, vocab_size))
c0 = torch.zeros(batch_size, hidden_size)
h0 = torch.zeros((batch_size, hidden_size))


In [262]:
Yb[0]

tensor([ 0,  0,  1, 12,  9,  3,  9, 15, 19,  1,  2, 18,  9, 14,  1,  0, 15, 18,
         9,  1, 14, 14,  1,  0, 21, 18,  0,  0,  2, 18,  1, 25])

In [269]:
logprobs[0].shape

torch.Size([32, 27])

In [299]:
# Forward pass
loss = 0
for t in range(time_steps):
    prevh = H[t-1] if t > 0 else h0
    prevc = C[t-1] if t > 0 else c0
    # 32 examples of 27 one hot encoded words
    hidden1[t] = Xb[t] @ WLSTM1 # #(32, 27) @ (27, 30) = (32, 30)
    hidden2[t] = prevh @ WLSTM2 # (32, 30)
    total[t] = hidden1[t] + hidden2[t] # (32, 30)
    
    preact1[t] = total[t] * F + bias1 # (32, 30)
    preact2[t] = total[t] * i1 + bias2 # (32, 30)
    preact3[t] = total[t] * i2 + bias3 # (32, 30)
    preact4[t] = total[t] * O + bias4 # (32, 30)
    
    act1[t] = torch.sigmoid(preact1[t]) # (32, 30)
    act2[t] = torch.sigmoid(preact2[t]) # (32, 30)
    act3[t] = torch.sigmoid(preact3[t]) # (32, 30)
    act4[t] = torch.tanh(preact4[t]) # (32, 30)
    
    C[t] = act2[t] * prevc + act1[t] * act4[t] # (32, 30)
    Ct[t] = torch.tanh(C[t]) # (32, 30)
    Hout[t] = Ct[t] * act3[t] # (32, 30)
    output[t] = Hout[t] @ output_matrix # (32, 27)
    counts = output.exp()
    counts_sum = counts.sum(1, keepdims=True)
    counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
    probs = counts * counts_sum_inv
    logprobs = probs.log()
    loss += -logprobs[t][torch.arange(32), Yb]

torch.Size([8, 32])

In [293]:
Yb[0]

tensor([ 0,  0,  1, 12,  9,  3,  9, 15, 19,  1,  2, 18,  9, 14,  1,  0, 15, 18,
         9,  1, 14, 14,  1,  0, 21, 18,  0,  0,  2, 18,  1, 25])

torch.Size([8, 32, 27])

In [280]:
Yb[0].shape

torch.Size([32])

In [None]:
# Backward pass
dWLSTM1 = torch.zeros(vocab_size, hidden_size)
dWLSTM2 = torch.zeros(hidden_size, hidden_size)
dF = torch.zeros(batch_size, hidden_size)
di1 = torch.zeros(batch_size, hidden_size)
di2 = torch.zeros(batch_size, hidden_size)
dO = torch.zeros(batch_size, hidden_size)
dbias1 = torch.zeros(hidden_size)
dbias2 = torch.zeros(hidden_size)
dbias3 = torch.zeros(hidden_size)
dbias4 = torch.zeros(hidden_size)
doutput_matrix = torch.randn(hidden_size)

for t in reversed(range(x)):
    
    