In [1]:
# Dataset: source code for 'requests' python libraray
text = open("data/requests.txt").read()

In [2]:
import numpy as np

# sorted list of all unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# string-to-integer mapping
stoi = { char:i for i,char in enumerate(chars) }

# integer-to-string mapping
itos = { i:char for i,char in enumerate(chars) }

# lookup functions for the mappings
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# encode the entire text file and convert to a numpy array
data = np.array(encode(text))

In [3]:
import numpy as np

# Each character has weights of a 32 long vector, defined by n_embed (embedding dimension)
n_embd = 32

# Initialize embedding & unembedding matrix
embedding_matrix = np.random.randn(vocab_size, n_embd)
unembedding_matrix = np.random.randn(n_embd, vocab_size)



In [4]:
# Standard expansion factor of four
ffwd_expansion_factor = 4

# Initialize hidden layer and output layer
# Use Kaiming init to intelligently scale the layer's random weights
W1 = np.random.randn(n_embd, n_embd * ffwd_expansion_factor) * np.sqrt(2.0 / n_embd)
W2 = np.random.randn(n_embd * ffwd_expansion_factor, n_embd) * np.sqrt(2.0 / n_embd)
 

In [93]:
def cross_entropy_loss(y_pred, y_true):

    # Add a small epsilon to the prediction to avoid log(0), which is undefined.
    epsilon = 1e-9
    
    # cross-entropy formula
    loss = -np.sum(y_true * np.log(y_pred + epsilon))
    
    return loss

In [125]:
class Model:
    def __init__(self,embedding_matrix, unembedding_matrix, W1, W2):
        self.embedding_matrix = embedding_matrix
        self.unembedding_matrix = unembedding_matrix
        self.W1 = W1
        self.W2 = W2

    def forward(self, x_batch):
        
        # Output shape: (B, T, n_embd)
        embd = self.embedding_matrix[x_batch]

        hidden = embd @ self.W1
        hidden_activated = np.maximum(0, hidden)
        processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)

        # Final projection to logits
        logits = processed_vectors @ self.unembedding_matrix
        
        return logits

    def pred (self, x):

        logits = self.forward(x)

        ## Apply softmax function to logits
        stable_logits = logits - np.max(logits) # This ensures the largest logit is 0
        raw_preds = np.exp(stable_logits) / np.sum(np.exp(stable_logits))        
        preds = {}

        for idx, raw_pred in enumerate(raw_preds):

            preds[itos[idx]] = raw_pred
        
        return preds

    def loss (self, logits, y_batch):

        # Get the dimensions for indexing
        B, T, C = logits.shape

        # Stable softmax
        max_logits = np.max(logits, axis=-1, keepdims=True)
        stable_logits = logits - max_logits
        exp_logits = np.exp(stable_logits)
        probabilities = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
    
        # Get the probabilities for the correct target characters using efficient indexing
        correct_char_probs = probabilities[np.arange(B)[:, None], np.arange(T), y_batch]

        # Calculate negative log likelihood
        loss_array = -np.log(correct_char_probs + 1e-9)
    
        # Average the loss over the whole batch to get a single number
        mean_loss = np.mean(loss_array)
    
        # Return probabilities because they are the starting point for backpropagation
        return mean_loss, probabilities


In [126]:
model = Model(embedding_matrix, unembedding_matrix, W1, W2)
# Get next character predictions for 'd'
logits = model.forward([stoi['d'],stoi['d']])

  hidden = embd @ self.W1
  hidden = embd @ self.W1
  hidden = embd @ self.W1
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  logits = processed_vectors @ self.unembedding_matrix
  logits = processed_vectors @ self.unembedding_matrix
  logits = processed_vectors @ self.unembedding_matrix


In [109]:
import random

def get_batch(data, batch_size, block_size):

    x_batch = []
    y_batch = []

    # Generate batchs 
    for block in [0] * batch_size:

        # Get random range in datast of size=block_size
        slice_idx = random.randrange(0, len(data) - block_size)
        x_batch.append(data[slice_idx:slice_idx+block_size])
        y_batch.append(data[slice_idx+1:slice_idx+block_size+1])

    return np.array(x_batch), np.array(y_batch)



In [131]:
# Training hyperparameters
max_iters = 5000
learning_rate = 1e-3 # A common starting point for learning rate
eval_interval = 500  # How often we'll print the loss
batch_size = 4
block_size = 8

# Training loop
for step in range(max_iters):
    
    # Get a mini-batch of data
    x_batch, y_batch = get_batch(data, batch_size, block_size)

    
    # Calculate loss and probabilites
    logits = model.forward(x_batch)
    loss_initial, probabilities = model.loss(logits, y_batch)

    print(f"Loss: {loss_initial}")

    one_hot_array = np.eye(vocab_size)[y_batch]

    print(one_hot_array
    initial_gradient = predicted_probabilities - one_hot_array

    
          
    # Backward Pass
    model.backward()
    
    # Optimizer step (Updated weights with gradients)
    for param in model.parameters():
        param -= learning_rate * param.grad 
        
    # Print out the loss periodically
    if step % eval_interval == 0:
        print(f"Step {step}: Training Loss = {loss}")



[[[ 16.23490803   5.42676734  28.87936671 ...  -2.94016662  30.17489851
   -19.97881587]
  [ 14.33743883  -2.44574008  17.00839391 ...  -0.85616978  26.47614349
   -20.21357627]
  [ 45.80885095  17.923195     7.61935221 ...  24.74539571  12.13482784
     2.22955933]
  ...
  [  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]
  [  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]
  [  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]]

 [[  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]
  [  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]
  [  4.85144256 -14.60702026  -8.63936801 ...   6.11538346   2.898202
    21.59168967]
  ...
  [  8.04645693  12.72710851   1.98806376 ... -10.02812498   7.78929489
    -3.81596985]
  [  1.39216115  -1.3638196   22.15307323 ...   6.26748186   6.99444994
   -10.81259746]
  [  1.39216115  -1

  hidden = embd @ self.W1
  hidden = embd @ self.W1
  hidden = embd @ self.W1
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  processed_vectors = hidden_activated @ self.W2 # Shape: (B, T, n_embd)
  logits = processed_vectors @ self.unembedding_matrix
  logits = processed_vectors @ self.unembedding_matrix
  logits = processed_vectors @ self.unembedding_matrix


NameError: name 'predicted_probabilities' is not defined