In [187]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import random
%matplotlib inline

In [188]:
# Read in all the words
words = open('data/names.txt', 'r').read().splitlines()
print(words[:10])

# Build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)

print(f'Character to index mapping: {itos}')
print(f'Vocabulary size: {vocab_size}')

# Shuffle the words
random.seed(42)
random.shuffle(words)


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Character to index mapping: {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
Vocabulary size: 27


In [190]:
# Encode the words with '.' so the model can learn when a name should end
def encode_words(words):
    encoded = []
    for w in words:
        encoded.extend([stoi[ch] for ch in '.' + w ])
    return encoded

encoded = encode_words(words)
encoded[:20]

[0, 25, 21, 8, 5, 14, 7, 0, 4, 9, 15, 14, 4, 18, 5, 0, 24, 1, 22, 9]

In [191]:
# Parameters for creating training and test sets
n = len(encoded)
n1 = int(0.8 * n)
block_size = 8
batch_size = 1

train_seq = encoded[:n1]
dev_seq = encoded[n1:]

In [192]:
def create_pairs(seq, block_size):
    X, Y = [], []
    for i in range(0, len(seq) - block_size, block_size):
        X.append(seq[i:i+block_size])
        Y.append(seq[i+1:i+block_size+1])
    X = torch.tensor(X, dtype=torch.float32)
    Y = torch.tensor(Y, dtype=torch.float32)
    return X, Y

Xtr, Ytr = create_pairs(train_seq, block_size)
Xdev, Ydev = create_pairs(dev_seq, block_size)

In [193]:
# We are sure that we can pass in previous example's hidden layer
print(Xtr[1])
print(Ytr[1])
print(Xtr[2])
print(Ytr[2])
print(Xtr[3])
print(Ytr[3])
print(f'Training data shapes - X: {Xtr.shape}, Y: {Ytr.shape}')
print(f'Development data shapes - X: {Xdev.shape}, Y: {Ydev.shape}')

tensor([ 4.,  9., 15., 14.,  4., 18.,  5.,  0.])
tensor([ 9., 15., 14.,  4., 18.,  5.,  0., 24.])
tensor([24.,  1., 22.,  9.,  5., 14.,  0., 10.])
tensor([ 1., 22.,  9.,  5., 14.,  0., 10., 15.])
tensor([15., 18.,  9.,  0., 10., 21.,  1., 14.])
tensor([18.,  9.,  0., 10., 21.,  1., 14., 12.])
Training data shapes - X: torch.Size([22814, 8]), Y: torch.Size([22814, 8])
Development data shapes - X: torch.Size([5703, 8]), Y: torch.Size([5703, 8])


In [195]:
lr = 0.01
batch_size = 1
hidden_size = 30
time_steps = 8
input_size = 27

In [199]:
# def softmax(x):
#     e_x = torch.exp(x - torch.max(x, dim=1, keepdim=True)[0])
#     return e_x / torch.sum(e_x, dim=1, keepdim=True)

def softmax(x):
    e_x = torch.exp(x)
    return e_x / torch.sum(e_x, dim=1, keepdim=True)

In [197]:
# Initalize parameters
Fvh = torch.randn(vocab_size, hidden_size) * 0.01
i1vh = torch.randn(vocab_size, hidden_size) * 0.01
i2vh = torch.randn(vocab_size, hidden_size) * 0.01
Ovh = torch.randn(vocab_size, hidden_size) * 0.01

Fhh = torch.randn(hidden_size, hidden_size) * 0.01
i1hh = torch.randn(hidden_size, hidden_size) * 0.01
i2hh = torch.randn(hidden_size, hidden_size) * 0.01
Ohh = torch.randn(hidden_size, hidden_size) * 0.01

bias1 = torch.zeros(1, hidden_size)
bias2 = torch.zeros(1, hidden_size)
bias3 = torch.zeros(1, hidden_size)
bias4 = torch.zeros(1, hidden_size)

output_matrix = torch.randn(hidden_size, vocab_size) * 0.01
output_bias = torch.zeros(1, vocab_size)

X = {}
H = {}
C = {}
Ct = {}
preact1 = {}
preact2 = {}
preact3 = {}
preact4 = {}
act1 = {}
act2 = {}
act3 = {}
act4 = {}
logits = {}

In [201]:
def sample(hprev, cprev, seed_ix, n):
    # Create one hot vector based on input seed_ix
    x = torch.zeros(1, vocab_size)
    x[0, seed_ix] = 1
    ixes = []

    for t in range(n):
        preact1 = x @ Fvh + hprev @ Fhh + bias1 # (1, 27) @ (27, 30) + (1, 30) @ (30, 30) + (30) = (1, 30)
        preact2 = x  @ i1vh + hprev  @ i1hh * bias2 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
        preact3 = x  @ i2vh + hprev  @ i2hh + bias3 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
        preact4 = x  @ Ovh + hprev  @ Ohh + bias4 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
        
        act1 = torch.sigmoid(preact1) # (1, 30)
        act2 = torch.sigmoid(preact2) # (1, 30)
        act3 = torch.tanh(preact3) # (1, 30)
        act4 = torch.sigmoid(preact4) # (1, 30)

        # Forget gate * previous cell state + i1 gate * i2 gate
        C = act1 * cprev  + act2  * act3  # (1, 30)
        Ct = torch.tanh(C) # (1, 30)
        H = Ct * act4  # (1, 30)          

        # Logits
        logits = H  @ output_matrix + output_bias # (1, 30) @ (30, 27) + (1, 27) = (1, 27)

        # Probability
        counts = logits.exp()
        counts_sum = counts.sum(1, keepdims=True)
        counts_sum_inv = counts_sum**-1
        probs = counts * counts_sum_inv

        # Sample from distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        if (ix == 0):
            break
        ixes.append(ix)

        # Update our parameters
        x = torch.zeros(1, vocab_size)
        x[0, ix] = 1
        hprev = H
        cprev = C
    return ixes

In [202]:
max_iterations = 0

while max_iterations < 100000:
    
    # Looping over batches (each batch is just a single row)
    for batch_num in range(Xtr.shape[0]):

        # Initial cell state and hidden state and loss
        if (max_iterations == 0):
            H[-1] = torch.zeros(1, hidden_size)
            C[-1] = torch.zeros(1, hidden_size)
        loss = 0   
        
        # Get a batch of random numbers into correct shape
        # Get corresponding row from training set
        Xb = Xtr[batch_num, :].to(torch.long) # (1, 8)
        # Conver to one hot vectors
        Xb = F.one_hot(Xb, 27).float() # (8, 27)
        # Get corresponding row from training set
        Yb = Ytr[batch_num].to(torch.long) # (8, 1)
        
### --------------------------------------------------------------------------------------------------------------        
        # Forward pass over all time steps
        for t in range(time_steps):
            # Pass through weight matrices (last hidden activations and current time step's input)
            # This means that there is one less hidden layer than the number of activations inputted
            # Xb[t] gets a single row from the training set
            preact1[t] = Xb[t] @ Fvh + H[t-1] @ Fhh + bias1 # (1, 27) @ (27, 30) + (1, 30) @ (30, 30) + (30) = (1, 30)
            preact2[t] = Xb[t]  @ i1vh + H[t-1]  @ i1hh * bias2 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            preact3[t] = Xb[t]  @ i2vh + H[t-1]  @ i2hh + bias3 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            preact4[t] = Xb[t]  @ Ovh + H[t-1]  @ Ohh + bias4 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)

            # Compute activations
            act1[t] = torch.sigmoid(preact1[t]) # (1, 30)
            act2[t] = torch.sigmoid(preact2[t]) # (1, 30)
            act3[t] = torch.tanh(preact3[t]) # (1, 30)
            act4[t] = torch.sigmoid(preact4[t]) # (1, 30)
            
            # Forget gate * previous cell state + i1 gate * i2 gate
            C[t] = act1[t] * C[t-1]  + act2[t]  * act3[t]  # (1, 30)
            Ct[t] = torch.tanh(C[t]) # (1, 30)
            # Compute current time step's hidden output to be passed onto the next time step
            H[t] = Ct[t] * act4[t]  # (1, 30)          

            # Logits
            logits[t] = H[t]  @ output_matrix + output_bias # (1, 30) @ (30, 27) + (1, 27) = (1, 27)

            # Cross entropy
            counts = logits[t].exp()
            counts_sum = counts.sum(1, keepdims=True)
            counts_sum_inv = counts_sum**-1
            probs = counts * counts_sum_inv
            logprobs = probs.log()
            loss += -logprobs[0, Yb[t]]

        # Continue the sequence for time step -1 of the next input sequence by passing it on
        H[-1] = H[time_steps - 1]
        C[-1] = C[time_steps - 1]

        # Print out the loss occasionally, and sample from the model occassionally
        if max_iterations % 1000 == 0:
            print("loss: ", loss)
            print("iteration: ", max_iterations)
            # Pass in the last hidden layer, and last cell state
            # Pass in the index which can be referenced through itos
            sample_ix = sample(H[-1], C[-1], 0, 15)
            txt = ''.join(itos[ix] for ix in sample_ix)
            print('----\n %s \n----' % (txt,))

        # Increment max iterations (each training example means one iteration)
        max_iterations += 1 

### ---------------------------------------------------------------------------------------------------------------
        # Backward pass
        # Set to zeros initially
        dHnext = torch.zeros_like(H[0])
        dCnext = torch.zeros_like(C[0])
        
        dFvh = torch.zeros(vocab_size, hidden_size)
        di1vh = torch.zeros(vocab_size, hidden_size)
        di2vh = torch.zeros(vocab_size, hidden_size)
        dOvh = torch.zeros(vocab_size, hidden_size)
        
        dFhh = torch.zeros(hidden_size, hidden_size)
        di1hh = torch.zeros(hidden_size, hidden_size)
        di2hh = torch.zeros(hidden_size, hidden_size)
        dOhh = torch.zeros(hidden_size, hidden_size)
        
        dbias1 = torch.zeros(1, hidden_size)
        dbias2 = torch.zeros(1, hidden_size)
        dbias3 = torch.zeros(1, hidden_size)
        dbias4 = torch.zeros(1, hidden_size)
        
        doutput_matrix = torch.zeros(hidden_size, vocab_size)
        doutput_bias = torch.zeros(1, vocab_size)

        for t in reversed(range(time_steps)):
            # Backpropogate cross entropy
            dlogits = F.softmax(logits[t], 1)
            dlogits [0, Yb[t]] -= 1
            
            # Backpropogate output matrix and its bias
            doutput_matrix +=  H[t].T @ dlogits # (30, 1) @ (1, 27) = (30, 27)
            doutput_bias += dlogits # (1, 27)
            
            # Backpropogate into H (two derivatives due to 2 dependencies)
            dH = dlogits  @ output_matrix.T + dHnext # (1, 27) @ (27, 30) + (1, 30) = (1, 30)
            
            # Backpropogate dact4 (output gate activations)
            dact4  = dH  * Ct[t]  # (1, 30) * (1, 30) = (1, 30)
            
            # Backpropogate dC (current cell state and also two derivatives)
            dC = dH * act4[t] * (1 - torch.tanh(C[t]) ** 2) + dCnext # (1, 30) * (1, 30) * (1, 30) = (1, 30)
            
            ### Backpropogate act1 and previous cell state
            # Forget gate activations
            # dCnext activations
            dact1 = dC * C[t-1] # (1, 30) * (1, 30) = (1, 30)
            dCnext = dC * act1[t]
            
            # Backpropogate i1 activations
            dact2  = dC  * act3[t] # (1, 27)
            
            # Backpropogate i2 activations
            dact3  = dC  * act2[t]  # (1, 27)
            
            # Backpropogate all preactivations
            dpreact1 = dact1 * act1[t] * (1 - act1[t])
            dpreact2 = dact2 * act2[t] * (1 - act2[t])
            dpreact3 = dact3 * (1 - torch.tanh(preact3[t]) ** 2)
            dpreact4 = dact4 * act4[t] * (1 - act4[t])
            
            # Backpropogate gates
            dFvh += Xb[t].reshape(vocab_size, 1) @ dpreact1  # (27, 1) (1, 30) = (27, 30)
            dFhh += H[t-1].reshape(hidden_size, 1) @ dpreact1  # (30, 1) (1, 30) = (30, 30)
            dbias1 += dpreact1
            
            di1vh += Xb[t].reshape(vocab_size, 1) @ dpreact2 # (27, 1) (1, 30) = (27, 30)
            di1hh += H[t-1].reshape(hidden_size, 1) @ dpreact2 # (30, 1) (1, 30) = (30, 30)
            dbias2 += dpreact2
            
            di2vh += Xb[t].reshape(vocab_size, 1) @ dpreact3 # (27, 1) (1, 30) = (27, 30)
            di2hh += H[t-1].reshape(hidden_size, 1) @ dpreact3 # (30, 1) (1, 30) = (30, 30)
            dbias3 += dpreact3
            
            dOvh += Xb[t].reshape(vocab_size, 1) @ dpreact4 # (27, 1) (1, 30) = (27, 30)
            dOhh += H[t-1].reshape(hidden_size, 1) @ dpreact4 # (30, 1) (1, 30) = (30, 30)
            dbias4 += dpreact4
            
            # Backpropogate dHnext
            # (1, 30) @ (30, 30) ...  = (1, 30)
            dHnext  = dpreact1  @ Fhh.T + dpreact2  @ i1hh.T + dpreact3  @ i2hh.T + dpreact4  @ Ohh.T

        # torch.nn.utils.clip_grad_norm_([dFvh, di1vh, di2vh, dOvh, dFhh, di1hh, di2hh, dOhh, dbias1, dbias2, dbias3, dbias4, doutput_matrix, doutput_bias], max_norm=1.0)

### ------------------------------------------------------------------------------------------------------------------
    
        # Update parameters using gradients
        Fvh -= lr * dFvh
        i1vh -= lr * di1vh
        i2vh -= lr * di2vh
        Ovh -= lr * dOvh

        Fhh -= lr * dFhh
        i1hh -= lr * di1hh
        i2hh -= lr * di2hh
        Ohh -= lr * dOhh
        
        bias1 -= lr * dbias1
        bias2 -= lr * dbias2
        bias3 -= lr * dbias3
        bias4 -= lr * dbias4

        output_matrix -= lr * doutput_matrix
        output_bias -= lr * doutput_bias
    
    if (max_iterations == 10000):
        break

    

loss:  tensor(25.4559)
iteration:  0
----
 dee 
----
loss:  tensor(21.4874)
iteration:  1000
----
 dlsc 
----
loss:  tensor(23.5546)
iteration:  2000
----
 jrerwni 
----
loss:  tensor(20.9428)
iteration:  3000
----
 palmy 
----
loss:  tensor(19.0532)
iteration:  4000
----
 jqanen 
----
loss:  tensor(26.2684)
iteration:  5000
----
 npihany 
----
loss:  tensor(22.2071)
iteration:  6000
----
 grytenn 
----
loss:  tensor(19.8890)
iteration:  7000
----
 rasie 
----
loss:  tensor(15.2436)
iteration:  8000
----
 avabcl 
----
loss:  tensor(14.7985)
iteration:  9000
----
 jlzen 
----
loss:  tensor(19.4003)
iteration:  10000
----
 heza 
----
loss:  tensor(16.3885)
iteration:  11000
----
 slhyanni 
----
loss:  tensor(16.7889)
iteration:  12000
----
 cetmay 
----
loss:  tensor(23.2901)
iteration:  13000
----
 zalria 
----
loss:  tensor(20.9950)
iteration:  14000
----
 nenla 
----
loss:  tensor(21.3691)
iteration:  15000
----
 tittha 
----
loss:  tensor(21.7759)
iteration:  16000
----
 corakdo 
---

KeyboardInterrupt: 

In [221]:
output_matrix

tensor([[ 6.9731e-01,  4.2908e-01,  7.9749e-02, -3.1476e-01, -1.8931e-01,
          5.4785e-01, -2.9040e-02, -2.6152e-01, -1.5964e-01, -2.6184e-01,
          1.2604e-01, -2.7313e-01, -5.3776e-02, -2.8828e-01,  3.6541e-01,
          2.2846e-01, -8.3515e-02, -8.4945e-02,  6.6916e-02, -4.5706e-02,
          1.5521e-02,  3.3268e-01, -2.6228e-01,  4.9539e-02, -3.2127e-01,
         -2.4002e-01, -2.2564e-02],
        [-1.4182e+00, -5.2848e-01,  2.1544e-01,  5.7761e-02, -2.8968e-01,
         -4.4298e-01,  2.2780e-01,  1.0227e+00,  1.0928e-03, -8.8566e-01,
          2.2792e-01,  3.1789e-01,  2.5596e-01,  5.6435e-01, -1.1300e+00,
          1.4769e-01,  4.3240e-01,  9.0580e-02,  1.0102e-01,  9.2700e-01,
          6.0826e-01, -7.2635e-01,  1.1025e-01,  2.6847e-01,  1.0599e-01,
         -3.7137e-01,  1.8932e-01],
        [ 5.7060e-01, -1.7362e+00, -2.6351e-01,  1.7233e-01,  5.5306e-01,
         -3.1490e-01,  8.1408e-02,  7.1444e-01, -5.3695e-01, -4.7164e-01,
         -1.9672e-01, -5.0022e-02,  7.41

In [220]:
hprev = torch.zeros(1, hidden_size)
cprev = torch.zeros(1, hidden_size)
num_samples = 10
for i in range(num_samples):
    sample_ix = sample(hprev, cprev, 0, 15)
    txt = ''.join(itos[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))


----
 xore 
----
----
 barin 
----
----
 pighaneyaci 
----
----
 anvyn 
----
----
 wahiah 
----
----
 vira 
----
----
 brynste 
----
----
 kars 
----
----
 shai 
----
----
 keadylo 
----
