In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import random
%matplotlib inline

In [2]:
# read in all the words
words = open('data/names.txt', 'r').read().splitlines()
print(words[:10])

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)

print(f'Character to index mapping: {itos}')
print(f'Vocabulary size: {vocab_size}')

# Shuffle the words
random.seed(42)
random.shuffle(words)


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Character to index mapping: {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
Vocabulary size: 27


In [3]:
def encode_words(words):
    encoded = []
    for w in words:
        encoded.extend([stoi[ch] for ch in '.' + w  + '.'])
    return encoded

encoded = encode_words(words)

In [4]:
encoded[:20]

[0, 25, 21, 8, 5, 14, 7, 0, 0, 4, 9, 15, 14, 4, 18, 5, 0, 0, 24, 1]

In [5]:
n = len(encoded)
n1 = int(0.8 * n)
block_size = 8
batch_size = 32

train_seq = encoded[:n1]
dev_seq = encoded[n1:]

In [6]:
def create_pairs(seq, block_size):
    X, Y = [], []
    for i in range(0, len(seq) - block_size):
        X.append(seq[i:i+block_size])
        Y.append(seq[i+1:i+block_size+1])
    X = torch.tensor(X, dtype=torch.float32)
    Y = torch.tensor(Y, dtype=torch.float32)
    return X, Y

Xtr, Ytr = create_pairs(train_seq, block_size)
Xdev, Ydev = create_pairs(dev_seq, block_size)

In [7]:
print(Xtr[1])
print(Ytr[1])
print(f'Training data shapes - X: {Xtr.shape}, Y: {Ytr.shape}')
print(f'Development data shapes - X: {Xdev.shape}, Y: {Ydev.shape}')

tensor([25., 21.,  8.,  5., 14.,  7.,  0.,  0.])
tensor([21.,  8.,  5., 14.,  7.,  0.,  0.,  4.])
Training data shapes - X: torch.Size([208135, 8]), Y: torch.Size([208135, 8])
Development data shapes - X: torch.Size([52028, 8]), Y: torch.Size([52028, 8])


In [8]:
def split_into_batches(X, Y, batch_size, time_steps):
    num_batches = X.size(0) // batch_size
    
    # Adjust the number of examples to be divisible by batch_size * time_steps
    num_examples = num_batches * batch_size
    X = X[:num_examples, :]
    Y = Y[:num_examples, :]

    # Reshape X and Y to have dimensions: (time_steps, num_batches, batch_size)
    X = X.view(time_steps, num_batches, batch_size).permute(0, 1, 2)
    Y = Y.view(time_steps, num_batches, batch_size).permute(0, 1, 2)

    return X, Y


Xtr_batched, Ytr_batched = split_into_batches(Xtr, Ytr, 32, 8)
Xdev_batched, Ydev_batched = split_into_batches(Xdev, Ydev, 32, 8)


print(f'Training batches shape - X: {Xtr_batched.shape}, Y: {Ytr_batched.shape}')
print(f'Development batches shape - X: {Xdev_batched.shape}, Y: {Ydev_batched.shape}')

# (time_steps, num_batches, batch_size)

# print(Xtr_batched.shape)

# print(Xtr_batched[:, 1, 0:10])
# print("\n")
# print(Ytr_batched[:, 1, 0:10])
# print("\n")

# print("next batch")
# print(Xtr_batched[:, 0, 26:32])
# print("\n")
# print(Ytr_batched[:, 0, 26:32])



# (8, 1, 32) -> (8, 32)

Training batches shape - X: torch.Size([8, 6504, 32]), Y: torch.Size([8, 6504, 32])
Development batches shape - X: torch.Size([8, 1625, 32]), Y: torch.Size([8, 1625, 32])


In [9]:
batch_size = 32
hidden_size = 30
time_steps = 8
input_size = 27

# Parameters
Fvh = torch.randn(vocab_size, hidden_size)
i1vh = torch.randn(vocab_size, hidden_size)
i2vh = torch.randn(vocab_size, hidden_size)
Ovh = torch.randn(vocab_size, hidden_size)

Fhh = torch.randn(hidden_size, hidden_size)
i1hh = torch.randn(hidden_size, hidden_size)
i2hh = torch.randn(hidden_size, hidden_size)
Ohh = torch.randn(hidden_size, hidden_size)

bias1 = torch.zeros(hidden_size)
bias2 = torch.zeros(hidden_size)
bias3 = torch.zeros(hidden_size)
bias4 = torch.zeros(hidden_size)

output_matrix = torch.randn(hidden_size, vocab_size)

# Storage
preact1 = torch.zeros(time_steps, batch_size, hidden_size)
preact2 = torch.zeros(time_steps, batch_size, hidden_size)
preact3 = torch.zeros(time_steps, batch_size, hidden_size)
preact4 = torch.zeros(time_steps, batch_size, hidden_size)

act1 = torch.zeros(time_steps, batch_size, hidden_size)
act2 = torch.zeros(time_steps, batch_size, hidden_size)
act3 = torch.zeros(time_steps, batch_size, hidden_size)
act4 = torch.zeros((time_steps, batch_size, hidden_size))

Cin = torch.zeros((time_steps, batch_size, hidden_size))
Cout = torch.zeros((time_steps, batch_size, hidden_size))
Ctout = torch.zeros((time_steps, batch_size, hidden_size))

Hin = torch.zeros((time_steps, batch_size, hidden_size))
Hout = torch.zeros((time_steps, batch_size, hidden_size))

logits = torch.zeros((time_steps, batch_size, vocab_size))

c0 = torch.zeros(batch_size, hidden_size)
h0 = torch.zeros((batch_size, hidden_size))

# Backward pass

# To update
dFvh = torch.zeros(vocab_size, hidden_size)
di1vh = torch.zeros(vocab_size, hidden_size)
di2vh = torch.zeros(vocab_size, hidden_size)
dOvh = torch.zeros(vocab_size, hidden_size) 

dFhh = torch.zeros(hidden_size, hidden_size)
di1hh = torch.zeros(hidden_size, hidden_size)
di2hh = torch.zeros(hidden_size, hidden_size)
dOhh = torch.zeros(hidden_size, hidden_size)

dbias1 = torch.zeros(hidden_size) # (30)
dbias2 = torch.zeros(hidden_size) # (30)
dbias3 = torch.zeros(hidden_size) # (30)
dbias4 = torch.zeros(hidden_size) # (30)

doutput_matrix = torch.randn(hidden_size, vocab_size) # (30, 27)

# Placeholders (indexed with t)
dlogits = torch.zeros((time_steps, batch_size, vocab_size)) # (30, 27)
dhidden1 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dhidden2 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dtotal = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)

dpreact1 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30) 
dpreact2 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dpreact3 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dpreact4 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)

dact1 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30) 
dact2 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dact3 = torch.zeros(time_steps, batch_size, hidden_size) # (32, 30)
dact4 = torch.zeros((time_steps, batch_size, hidden_size)) # (32, 30)

dC = torch.zeros((time_steps, batch_size, hidden_size)) # (32, 30)
dCt = torch.zeros((time_steps, batch_size, hidden_size)) # (32, 30)
dHin = torch.zeros((time_steps, batch_size, hidden_size)) # (32, 30)
dHout = torch.zeros((time_steps, batch_size, hidden_size)) # (32, 30)
dlogits = torch.zeros((time_steps, batch_size, vocab_size)) # (32, 27)

dc0 = torch.zeros(batch_size, hidden_size)
dh0 = torch.zeros((batch_size, hidden_size))
dcn = torch.zeros(batch_size, hidden_size)
dhn = torch.zeros((batch_size, hidden_size))

In [16]:
parameters = [
    Fvh, i1vh, i2vh, Ovh,
    Fhh, i1hh, i2hh, Ohh,
    bias1, bias2, bias3, bias4,
    output_matrix,
]

# Set requires_grad to True for all parameters
for p in parameters:
    p.requires_grad_(True)
    torch.autograd.set_detect_anomaly(True)

# Print the total number of parameters
print("Total number of parameters:", sum(p.numel() for p in parameters))

Total number of parameters: 7770


In [17]:
lr = 0.8

In [19]:
while True:
    # Start with no cell activations, and no hidden activations
    h0 = torch.zeros((batch_size, hidden_size))
    c0 = torch.zeros((batch_size, hidden_size))
    
    # Loop over all batches (6504 total)
    for batch_num in reversed(range(Xtr.shape[1])):
        # Get a batch of random numbers into correct shape
        Xb = Xtr_batched[:, batch_num, :] # ( 8, 32)
        Xb = Xb.to(torch.long)
        Xb = F.one_hot(Xb, 27) # (8, 32, 27)
        Xb = Xb / 1.0
        Yb = Ytr_batched[batch_num]
        Yb = Yb.to(torch.long)
    
        # Forward propagation
        for t in range(time_steps):
            if t == 0:
                Hin[t] = h0
                Cin[t] = c0
            else:
                Hin[t] = Hout[t-1]
                Cin[t] = Cout[t-1]
            loss = 0
            
            preact1[t] = Xb[t] @ Fvh + Hin[t] @ Fhh + bias1 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            preact2[t] = Xb[t] @ i1vh + Hin[t] @ i1hh * bias2 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            preact3[t] = Xb[t] @ i2vh + Hin[t] @ i2hh + bias3 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            preact4[t] = Xb[t] @ Ovh + Hin[t] @ Ohh + bias4 # (32, 27) @ (27, 30) + (32, 30) @ (30, 30) + (30)
            
            act1[t] = torch.sigmoid(preact1[t]) # (32, 30)
            act2[t] = torch.sigmoid(preact2[t]) # (32, 30)
            act3[t] = torch.tanh(preact3[t]) # (32, 30)
            act4[t] = torch.sigmoid(preact4[t]) # (32, 30)
            
            Cout[t] = act1[t] * Cin[t] + act2[t] * act3[t] # (32, 30)
            if t < time_steps -1: Cin[t+1] = Cout[t]
            Ctout[t] = torch.tanh(Cout[t]) # (32, 30)
            Hout[t] = Ctout[t] * act4[t] # (32, 30)
            if t < time_steps -1: Hin[t+1] = Hout[t] 
            
            
            logits[t] = Hout[t] @ output_matrix # (32, 27)
            counts = logits.exp()
            counts_sum = counts.sum(1, keepdims=True)
            counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
            probs = counts * counts_sum_inv
            logprobs = probs.log()
            loss += -logprobs[t][torch.arange(32), Yb].mean()
    
        if (batch_num % 100 == 0):
            print (loss / time_steps)
    
        h0 = Hout[0]
        c0 = Cout[0]
        
        # Backward pass
        for p in parameters:
            p.grad = None
        loss.backward()
        # Update parameters using gradients
        for p, grad in zip(parameters, grads):
            # p.data += -lr * p.grad # old way of cheems doge (using PyTorch grad from .backward())
            p.data += -lr * grad # new way of swole doge TODO: enable
    

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 30]], which is output 0 of AsStridedBackward0, is at version 32; expected version 31 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [272]:
import torch

def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def softmax(x):
    e_x = torch.exp(x - torch.max(x, dim=1, keepdim=True)[0])
    return e_x / torch.sum(e_x, dim=1, keepdim=True)

def sample_model(start_vector, Fvh, Fhh, i1vh, i1hh, i2vh, i2hh, Ovh, Ohh, bias1, bias2, bias3, bias4, output_matrix):
    """
    Generate a sequence of samples from the model.
    
    Parameters:
    - start_vector: Initial one-hot encoded vector to start the generation (torch tensor of shape (1, input_size))
    - Other parameters: Model weights and biases
    
    Returns:
    - Generated sequence (list of integers)
    """
    input_size = start_vector.shape[1]
    hidden_size = Fvh.shape[1]
    
    # Initialize hidden and cell states
    h = torch.zeros((1, hidden_size))
    c = torch.zeros((1, hidden_size))
    
    # Initialize the generated sequence with the index of the start vector
    generated_sequence = [torch.argmax(start_vector).item()]
    
    # Loop until '.' is generated or the maximum sequence length is reached
    while generated_sequence[-1] != 0 and len(generated_sequence) <= max_length:
        x = start_vector  # Use the provided start vector as input
        
        preact1 = x @ Fvh + h @ Fhh + bias1
        preact2 = x @ i1vh + h @ i1hh + bias2
        preact3 = x @ i2vh + h @ i2hh + bias3
        preact4 = x @ Ovh + h @ Ohh + bias4
        
        act1 = torch.sigmoid(preact1)
        act2 = torch.sigmoid(preact2)
        act3 = torch.tanh(preact3)
        act4 = torch.sigmoid(preact4)
        
        c = act1 * c + act2 * act3
        h = torch.tanh(c) * act4
        
        logits = h @ output_matrix
        probs = softmax(logits)
        
        # Sample from the probability distribution to get the next input
        next_index = torch.multinomial(probs.squeeze(), 1).item()
        
        generated_sequence.append(next_index)
        
        # Update the start vector with the one-hot encoded representation of the next index
        start_vector = torch.zeros((1, input_size))
        start_vector[0, next_index] = 1
        
    return [itos[ch] for ch in generated_sequence]

# Example usage
input_size = 27  # Size of the one-hot encoded vector
start_index = 1  # Index representing the starting point (e.g., 'a')
start_vector = torch.zeros((1, input_size))
start_vector[0, start_index] = 1  # One-hot encode the starting point

# Assume Fvh, Fhh, i1vh, i1hh, i2vh, i2hh, Ovh, Ohh, bias1, bias2, bias3, bias4, and output_matrix are already defined

max_length = 100  # Maximum length of the generated sequence


In [288]:
generated_sequence = sample_model(start_vector, Fvh, Fhh, i1vh, i1hh, i2vh, i2hh, Ovh, Ohh, bias1, bias2, bias3, bias4, output_matrix)
print(''.join(generated_sequence))

axypdouqtzn.
