# 👶 The Little Baby

> A barebones GPT-style LLM implementation — plain Python and NumPy, with no deep-learning framework dependencies.

In [None]:
import re
import math
import uuid
import json
import time
import numpy as np
import pickle as pk
from pathlib import Path as pt
from datetime import datetime as dt

In [None]:
########################
# Model Workflow
########################

# Ask which workflow to run, then normalise the answer (trim whitespace, lower-case).
model_load_workflow = input("Enter the model load workflow (train/inference/extract): ")
model_load_workflow = model_load_workflow.strip().lower()

In [None]:
def is_valid_guid(guid_str):
    """
    Check whether guid_str is a UUID written in canonical textual form.

    guid_str: candidate value, normally a string.
    Returns: True only when guid_str is a string that uuid.UUID can parse AND
             that is identical to the parsed UUID's str() rendering;
             False for everything else, including non-string values.
    """
    # uuid.UUID raises AttributeError/TypeError (not ValueError) for values
    # such as None or an int, so reject non-strings before parsing.
    if not isinstance(guid_str, str):
        return False
    try:
        parsed = uuid.UUID(guid_str)
    except ValueError:
        return False
    # uuid.UUID also accepts variants (e.g. hex without hyphens); comparing
    # with str(parsed) keeps only the exact canonical spelling.
    return str(parsed) == guid_str

In [None]:
########################
# Hyper‐parameters
########################

# Resolve `model_config_uuid` and `config` for the selected workflow:
#   - "train": create a new UUID and use the hyper-parameters below;
#   - "inference" / "extract": load the configuration saved in
#     outputs/report_<uuid>.json, where <uuid> is typed in or taken from
#     config/last.json when the user answers "last".
# Any other workflow name stops the run here instead of failing later with
# an undefined `config`.
if model_load_workflow == "train":
    # Generate a UUID (random UUID)
    model_config_uuid = uuid.uuid4()
    config = {
        # model parameters
        "n_ctx": 128,         # used for chunking in the training phase and for positional embeddings in the inference phase | x2 the time of batch & size of the model
        "n_emb": 128,         # embedding dimension for each token which is for each character | x2 the time of batch & size of the model
        "dropout": 0.1,       # dropout probability
        "head_size": 128,     # total projection dim (to be split into heads) | x2 the time of batch from a certain point & size of the model
        "n_heads": 16,        # number of attention heads
        "n_layers": 4,        # number of transformer layers | x2 the time of batch from a certain point & size of the model

        # training parameters
        "num_epochs": 1,      # number of epochs to train
        "batch_size": 16,     # batch size of words to train | x2 the time of batch
        "lr": 1e-3,           # learning rate
    }
elif model_load_workflow in ("inference", "extract"):
    # Ask for an explicit UUID or the keyword "last"; trim and lower-case the
    # answer the same way the workflow prompt does.
    model_config_uuid_inpt = input("Enter the model configuration (<uuid>/last): ").strip().lower()

    if model_config_uuid_inpt == "last":
        # Reuse the UUID and prompt recorded in config/last.json.
        with open('config/last.json', 'r') as f:
            last = json.load(f)
        model_config_uuid_last = last['last_uuid']
        prompt_last = last['last_prompt']
        model_config_uuid = model_config_uuid_last
    else:
        model_config_uuid = model_config_uuid_inpt

    # Refuse anything that is not a canonical UUID before building a file path from it.
    if not is_valid_guid(model_config_uuid):
        print(f"Invalid UUID: {model_config_uuid}")
        # Raise SystemExit directly rather than calling the interactive
        # exit() helper, which is not meant for use in programs.
        raise SystemExit(1)

    # Load the configuration stored in that model's report file.
    with open(f'outputs/report_{model_config_uuid}.json', 'r') as f:
        report = json.load(f)
    config = report['config']
else:
    print(f"Unknown model load workflow: {model_load_workflow!r} (expected train/inference/extract)")
    raise SystemExit(1)

In [None]:
########################
# Model GUID
########################

# Show which model configuration UUID this session is using.
print(f"Model GUID: {model_config_uuid}")

In [None]:
########################
# Tokenization
########################

# Load the training corpus from inputs/input.txt. When the file is missing,
# fall back to a short built-in sample and save it at the SAME path that is
# read, so the next run finds it.
try:
    with open('inputs/input.txt', 'r', encoding='utf-8') as f:
        text = f.read()
except FileNotFoundError:
    print("inputs/input.txt not found. Creating a dummy file for demonstration.")
    dummy_text = "This is a sample text for training a tiny GPT model. It contains various characters and will be used to demonstrate the backpropagation implementation. The quick brown fox jumps over the lazy dog. This text is long enough to create some batches for training. We hope this works! Backpropagation is fundamental to training neural networks. It allows us to compute gradients efficiently."
    # The inputs/ directory may not exist yet; create it before writing.
    pt('inputs').mkdir(parents=True, exist_ok=True)
    with open('inputs/input.txt', 'w', encoding='utf-8') as f:
        f.write(dummy_text)
    text = dummy_text # Use the dummy text directly

# Character-level vocabulary: every distinct character is one token,
# numbered by its position in sorted order.
vocab = sorted(set(text))
vocab_size = len(vocab)
itos = dict(enumerate(vocab))               # int-to-string
stoi = {c: i for i, c in enumerate(vocab)}  # string-to-int


def encode(s):
    """Convert a string to a list of token ids (KeyError for a character not in stoi)."""
    return [stoi[c] for c in s]


def decode(l):
    """Convert an iterable of token ids back to a string."""
    return ''.join(itos[i] for i in l)


# Encode the whole corpus and keep the first 90% for training, the rest for validation.
data = encode(text)
split = int(0.9 * len(data))
train_data = data[:split]
val_data = data[split:]

########################
# Data Preparation
########################

def prepare_data(data, n_ctx):
    """
    Cut a token sequence into overlapping (input, target) windows.

    data: sequence of token ids.
    n_ctx: window length.
    Returns: (X, y) int32 arrays. Row i of X is data[i:i+n_ctx] and row i of y
             is the same window shifted one token to the right. When data has
             fewer than n_ctx + 1 tokens both arrays have shape (0, n_ctx).
    """
    n_windows = len(data) - n_ctx
    if n_windows < 1:
        # Not enough tokens for a single window plus its shifted target.
        return (np.array([], dtype=np.int32).reshape(0, n_ctx),
                np.array([], dtype=np.int32).reshape(0, n_ctx))

    starts = range(n_windows)
    inputs = [data[s:s + n_ctx] for s in starts]
    targets = [data[s + 1:s + n_ctx + 1] for s in starts]
    return np.array(inputs, dtype=np.int32), np.array(targets, dtype=np.int32)

# Build the (input, target) window arrays for the training and validation
# splits, using the context length from the active configuration.
X_train, y_train = prepare_data(train_data, config["n_ctx"])
X_val, y_val = prepare_data(val_data, config["n_ctx"])

def get_batches(X, y, b_size, shuffle=True):
    """
    Generator over mini-batches of (X, y).

    X, y: arrays indexed along their first axis; X.shape[0] sets the sample count.
    b_size: maximum number of samples per batch (the last batch may be smaller).
    shuffle: when True, visit the samples in a random order drawn with np.random.shuffle.
    Yields: (X_batch, y_batch) pairs; nothing at all when X has no rows.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        return  # empty dataset: the generator finishes immediately

    order = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(order)

    for start in range(0, n_samples, b_size):
        picked = order[start:start + b_size]
        yield X[picked], y[picked]

In [None]:
########################
# Base Module 
########################

class Module:
    """
    Base class for all neural network modules.
    Manages parameters and training/evaluation mode.
    """
    def __init__(self):
        self.training = True # Default mode for modules
        self._parameters = [] # List to store trainable parameters

    def parameters(self):
        """
        Returns a list of all trainable parameters in the module.
        Composite modules override this to concatenate their children's parameters.
        """
        return self._parameters

    def _child_modules(self):
        """
        Yields every Module stored directly as an instance attribute, or as an
        element of a list/tuple attribute (e.g. a stack of blocks, or Modules
        placed in `_parameters`).
        """
        for value in vars(self).values():
            if isinstance(value, Module):
                yield value
            elif isinstance(value, (list, tuple)):
                for item in value:
                    if isinstance(item, Module):
                        yield item

    def train(self, mode=True):
        """
        Sets the module and all its sub-modules to training mode.
        If mode is False, sets to evaluation mode.

        Sub-modules are found through the instance attributes, so a composite
        module does not have to forward the call by hand (doing so anyway is
        harmless: the same flag is simply written again).
        """
        self.training = mode
        for child in self._child_modules():
            child.train(mode)

    def eval(self):
        """Sets the module (and its sub-modules) to evaluation mode."""
        self.train(False)

In [None]:
########################
# Helper Functions
########################

# GELU activation and its derivative
def gelu(x):
    """
    Gaussian Error Linear Unit (GELU), tanh form:
    0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
    """
    inner = math.sqrt(2/math.pi) * (x + 0.044715 * np.power(x, 3))
    gate = 1.0 + np.tanh(inner)
    return 0.5 * x * gate

def gelu_prime(x):
    """
    Derivative of the tanh-form GELU implemented by `gelu`.

    With k(x) = sqrt(2/pi) * (x + 0.044715 * x^3):
        gelu'(x) = 0.5 * (1 + tanh(k)) + 0.5 * x * sech^2(k) * k'(x)
    This is the exact derivative of the tanh formula (which is itself an
    approximation of the erf-based GELU).
    """
    c = math.sqrt(2/math.pi)
    k = c * (x + 0.044715 * np.power(x, 3))
    tanh_k = np.tanh(k)
    # sech^2(k) = 1 - tanh^2(k). Unlike 1 / cosh(k)**2 this cannot overflow
    # for large |x|, and it reuses the tanh that is needed anyway.
    sech_sq = 1.0 - tanh_k * tanh_k
    k_prime = c * (1 + 3 * 0.044715 * np.power(x, 2))
    return 0.5 * (1 + tanh_k) + 0.5 * x * sech_sq * k_prime

# Softmax (along given axis)
def softmax(x, axis=-1):
    """
    Softmax along `axis`.
    The per-slice maximum is subtracted before exponentiating so that large
    inputs do not overflow; the result is unchanged mathematically.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    normaliser = np.sum(exps, axis=axis, keepdims=True)
    return exps / normaliser

In [None]:
########################
# Layers
########################

class Embedding(Module):
    """
    A simple Embedding layer.
    Maps integer indices to dense vectors (rows of `weight`).
    """
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        # Initialize weights with small random values
        self.weight = np.random.randn(num_embeddings, embedding_dim) * 0.02
        self._parameters = [self.weight] # Register weight as a parameter

    def forward(self, x):
        """
        Forward pass for Embedding layer.
        x: input indices (e.g., token IDs), shape (B, T) or (T,)
        Returns: embedded vectors, shape (B, T, embedding_dim) or (T, embedding_dim)
        """
        self._cache = x # Store input indices for backward pass
        return self.weight[x]

    def backward(self, grad_output):
        """
        Backward pass for Embedding layer.
        grad_output: gradient from the subsequent layer, shape x.shape + (embedding_dim,)
        Returns: (None, [grad_weight]) - indices carry no gradient, so grad_input is None.
        """
        indices = self._cache # Retrieve input indices
        grad_weight = np.zeros_like(self.weight)

        # Scatter-add each position's gradient into the row it looked up.
        # np.add.at is unbuffered, so an index that occurs several times
        # accumulates every contribution (a plain `grad_weight[indices] += ...`
        # would keep only one of them). It handles (T,) and (B, T) indices alike.
        np.add.at(grad_weight, indices, grad_output)

        return None, [grad_weight]

class Linear(Module):
    """
    Fully connected layer computing y = x @ W + b (the bias is optional).
    """
    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        # Small random weights, zero bias.
        self.weight = np.random.randn(in_features, out_features) * 0.02
        self.bias = np.zeros(out_features) if bias else None
        if self.bias is None:
            self._parameters = [self.weight]
        else:
            self._parameters = [self.weight, self.bias]

    def forward(self, x):
        """
        Apply the affine map to the last axis of x.
        x: input tensor, shape (..., in_features)
        Returns: output tensor, shape (..., out_features)
        """
        self._cache = x # kept for backward
        projected = x.dot(self.weight)
        if self.bias is None:
            return projected
        return projected + self.bias

    def backward(self, grad_output):
        """
        Backpropagate through the affine map.
        grad_output: gradient w.r.t. the output, shape (..., out_features)
        Returns: (grad_input, [grad_weight]) or (grad_input, [grad_weight, grad_bias])
                 when the layer has a bias; grad_input has the shape of the cached input.
        """
        x = self._cache
        # Collapse all leading axes so the gradients are plain 2-D matrix products.
        flat_x = x.reshape(-1, x.shape[-1])                           # (N, in_features)
        flat_grad = grad_output.reshape(-1, grad_output.shape[-1])    # (N, out_features)

        # dL/dW = x^T @ dL/dy
        param_grads = [flat_x.T @ flat_grad]
        if self.bias is not None:
            # dL/db = dL/dy summed over every row
            param_grads.append(np.sum(flat_grad, axis=0))

        # dL/dx = dL/dy @ W^T, restored to the input's original shape
        grad_input = (flat_grad @ self.weight.T).reshape(x.shape)
        return grad_input, param_grads

class LayerNorm(Module):
    """
    Layer normalization over the last axis, followed by a learnable
    per-feature scale (gamma) and shift (beta).
    """
    def __init__(self, dims, eps=1e-5):
        super().__init__()
        self.eps = eps              # added to the variance before the square root
        self.gamma = np.ones(dims)  # scale, starts as identity
        self.beta = np.zeros(dims)  # shift, starts at zero
        self._parameters = [self.gamma, self.beta]

    def forward(self, x):
        """
        Normalize x along its last axis.
        x: input tensor, shape (..., dims)
        Returns: gamma * (x - mean) / sqrt(var + eps) + beta, same shape as x
        """
        mean = x.mean(axis=-1, keepdims=True)
        std = np.sqrt(x.var(axis=-1, keepdims=True) + self.eps)
        x_norm = (x - mean) / std

        # Everything backward needs.
        self._cache = (x, mean, std, x_norm)
        return self.gamma * x_norm + self.beta

    def backward(self, grad_output):
        """
        Backpropagate through the normalization.
        grad_output: gradient w.r.t. the output, same shape as the forward output.
        Returns: (grad_input, [grad_gamma, grad_beta])
        """
        x, mean, std, x_norm = self._cache
        n_features = x.shape[-1]
        leading_axes = tuple(range(grad_output.ndim - 1))

        # Parameter gradients: reduce over every axis except the feature axis.
        grad_gamma = np.sum(grad_output * x_norm, axis=leading_axes)
        grad_beta = np.sum(grad_output, axis=leading_axes)

        # Gradient reaching x_norm (undo the gamma scaling).
        grad_x_norm = grad_output * self.gamma
        centered = x - mean

        # Contribution through the variance ...
        grad_var = np.sum(grad_x_norm * centered * (-0.5) * (std**(-3)), axis=-1, keepdims=True)
        # ... and through the mean (directly, and via the variance).
        grad_mean = (np.sum(grad_x_norm * (-1 / std), axis=-1, keepdims=True)
                     + grad_var * (-2 / n_features) * np.sum(centered, axis=-1, keepdims=True))

        # Direct path through (x - mean) / std, then add the two indirect paths.
        grad_x = grad_x_norm / std
        grad_x += (2 / n_features) * grad_var * centered
        grad_x += (1 / n_features) * grad_mean

        return grad_x, [grad_gamma, grad_beta]

class Dropout(Module):
    """
    Dropout layer (inverted dropout).
    During training each element is zeroed with probability p and the
    survivors are scaled by 1 / (1 - p); in evaluation mode it is the identity.
    """
    def __init__(self, p):
        super().__init__()
        self.p = p # Dropout probability

    def forward(self, x):
        """
        Forward pass for Dropout.
        x: input tensor
        Returns: x with dropout applied when training and p > 0, otherwise x itself
        """
        if self.training and self.p > 0:
            if self.p >= 1.0:
                # Everything is dropped. Build the mask directly: the general
                # formula would divide by (1 - p) == 0 and produce NaN.
                mask = np.zeros(x.shape)
            else:
                # Keep-mask already scaled by 1/(1-p) (inverted dropout).
                mask = (np.random.rand(*x.shape) >= self.p) / (1.0 - self.p)
            self._cache = mask # Store mask for backward pass
            return x * mask
        self._cache = None # No mask if not training or p=0
        return x

    def backward(self, grad_output):
        """
        Backward pass for Dropout.
        grad_output: gradient from subsequent layer.
        Returns: (grad_input, []) - no parameters to update.
        """
        mask = self._cache
        if mask is None:
            # Dropout was inactive in forward: the gradient passes through unchanged.
            return grad_output, []
        # Same mask (and scaling) as the forward pass.
        return grad_output * mask, []


In [None]:
########################
# Attention Mechanism
########################

class MultiHeadAttention(Module):
    """
    Causal Multi-Head Self-Attention.
    Projects the input to queries/keys/values, splits them into n_heads heads,
    applies masked scaled dot-product attention per head, and recombines the
    heads through an output projection.
    """
    def __init__(self, n_emb, head_size, n_heads, n_ctx, dropout_p):
        super().__init__()
        # The heads are obtained by reshaping the projection, so the total
        # projection width must split evenly; fail here with a clear message
        # rather than in a reshape during the first forward pass.
        if head_size % n_heads != 0:
            raise ValueError(
                f"head_size ({head_size}) must be divisible by n_heads ({n_heads})"
            )
        self.n_emb = n_emb
        self.head_size = head_size  # total projection dim across all heads
        self.n_heads = n_heads
        self.d_k = head_size // n_heads  # dimension per head

        # Linear projections for Query, Key, Value
        self.q_proj = Linear(n_emb, head_size, bias=False)
        self.k_proj = Linear(n_emb, head_size, bias=False)
        self.v_proj = Linear(n_emb, head_size, bias=False)

        # Output projection
        self.c_proj = Linear(head_size, n_emb)

        self.attn_dropout = Dropout(dropout_p)
        self.resid_dropout = Dropout(dropout_p)

        # Additive causal mask: -1e9 strictly above the diagonal (future
        # positions), 0 elsewhere, so masked scores vanish after softmax.
        self.causal_mask = np.triu(np.ones((n_ctx, n_ctx)) * -1e9, k=1)

    def parameters(self):
        """Returns all parameters of the attention module (q, k, v, then output projection)."""
        return (self.q_proj.parameters() +
                self.k_proj.parameters() +
                self.v_proj.parameters() +
                self.c_proj.parameters())

    def train(self, mode=True):
        """Sets the attention module and its sub-modules to training/eval mode."""
        super().train(mode) # Call base Module train to set self.training
        self.q_proj.train(mode)
        self.k_proj.train(mode)
        self.v_proj.train(mode)
        self.c_proj.train(mode)
        self.attn_dropout.train(mode)
        self.resid_dropout.train(mode)

    def forward(self, x):
        """
        Forward pass for MultiHeadAttention.
        x: input tensor, shape (B, T, n_emb) with T <= n_ctx
        Returns: output tensor, shape (B, T, n_emb)
        Raises: ValueError if T exceeds the context length the mask was built for.
        """
        B, T, _ = x.shape
        max_T = self.causal_mask.shape[0]
        if T > max_T:
            # Slicing the mask with [:T, :T] would silently return a smaller
            # mask; reject the input explicitly instead.
            raise ValueError(f"sequence length {T} exceeds the context length {max_T}")

        # Project input to Q, K, V
        Q_orig = self.q_proj.forward(x)  # (B, T, head_size)
        K_orig = self.k_proj.forward(x)
        V_orig = self.v_proj.forward(x)

        # (B, T, head_size) -> (B, n_heads, T, d_k)
        def split_heads(z):
            z = z.reshape(z.shape[0], z.shape[1], self.n_heads, self.d_k)
            return z.transpose(0, 2, 1, 3)

        Q = split_heads(Q_orig)
        K = split_heads(K_orig)
        V = split_heads(V_orig)

        # Compute scaled dot-product attention scores
        # (B, n_heads, T, d_k) @ (B, n_heads, d_k, T) -> (B, n_heads, T, T)
        scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / math.sqrt(self.d_k)

        # Apply causal mask (prevents attending to future tokens)
        masked_scores = scores + self.causal_mask[:T, :T]

        attn_weights = softmax(masked_scores, axis=-1)
        attn_weights_dropped = self.attn_dropout.forward(attn_weights)

        # Compute weighted sum of values
        # (B, n_heads, T, T) @ (B, n_heads, T, d_k) -> (B, n_heads, T, d_k)
        o = np.matmul(attn_weights_dropped, V)

        # Recombine heads: transpose and reshape back to (B, T, head_size)
        o_combined = o.transpose(0, 2, 1, 3).reshape(B, T, self.head_size)

        # Final linear projection
        out = self.c_proj.forward(o_combined)
        out_dropped = self.resid_dropout.forward(out)

        # Store all intermediate values for the backward pass (layout kept stable).
        self._cache = (x, Q_orig, K_orig, V_orig, Q, K, V, scores, masked_scores, attn_weights, attn_weights_dropped, o, o_combined)
        return out_dropped

    def backward(self, grad_output):
        """
        Backward pass for MultiHeadAttention.
        grad_output: gradient from the subsequent layer, shape (B, T, n_emb)
        Returns: (grad_input, list_of_param_grads), the gradients being ordered
                 like parameters(): q_proj, k_proj, v_proj, c_proj.
        """
        (x, Q_orig, K_orig, V_orig, Q, K, V, scores, masked_scores, attn_weights, attn_weights_dropped, o, o_combined) = self._cache

        # 1. Backward through resid_dropout (no parameters)
        grad_out_dropped, _ = self.resid_dropout.backward(grad_output)

        # 2. Backward through c_proj (output linear layer)
        grad_o_combined, c_proj_grads = self.c_proj.backward(grad_out_dropped)

        # 3. Undo the head recombination: (B, T, head_size) -> (B, n_heads, T, d_k)
        B, T, _ = grad_o_combined.shape
        grad_o = grad_o_combined.reshape(B, T, self.n_heads, self.d_k).transpose(0, 2, 1, 3)

        # 4. Backward through o = A @ V  => dL/dA = dL/do @ V^T, dL/dV = A^T @ dL/do
        grad_attn_weights_dropped = np.matmul(grad_o, V.transpose(0, 1, 3, 2))
        grad_V = np.matmul(attn_weights_dropped.transpose(0, 1, 3, 2), grad_o)

        # 5. Backward through attn_dropout
        grad_attn_weights, _ = self.attn_dropout.backward(grad_attn_weights_dropped)

        # 6. Backward through softmax: dL/dx = y * (dL/dy - sum(dL/dy * y)), y = softmax(x)
        grad_masked_scores = grad_attn_weights * attn_weights - np.sum(grad_attn_weights * attn_weights, axis=-1, keepdims=True) * attn_weights

        # 7. The causal mask is an added constant, so the gradient passes straight through.
        grad_scores = grad_masked_scores

        # 8. Backward through S = Q @ K^T / sqrt(d_k)
        #    dL/dQ = (dL/dS @ K) / sqrt(d_k),  dL/dK = (dL/dS^T @ Q) / sqrt(d_k)
        grad_Q = np.matmul(grad_scores, K) / math.sqrt(self.d_k)
        grad_K = np.matmul(grad_scores.transpose(0, 1, 3, 2), Q) / math.sqrt(self.d_k)

        # 9. Undo split_heads: (B, n_heads, T, d_k) -> (B, T, head_size)
        def un_split_heads(z_grad, original_shape):
            return z_grad.transpose(0, 2, 1, 3).reshape(original_shape)

        grad_Q_orig = un_split_heads(grad_Q, Q_orig.shape)
        grad_K_orig = un_split_heads(grad_K, K_orig.shape)
        grad_V_orig = un_split_heads(grad_V, V_orig.shape)

        # 10. Backward through q_proj, k_proj, v_proj
        grad_x_q, q_proj_grads = self.q_proj.backward(grad_Q_orig)
        grad_x_k, k_proj_grads = self.k_proj.backward(grad_K_orig)
        grad_x_v, v_proj_grads = self.v_proj.backward(grad_V_orig)

        # x feeds all three projections, so its gradient is the sum of the three paths.
        grad_x = grad_x_q + grad_x_k + grad_x_v

        # Assemble gradients in the same order as parameters().
        current_mha_param_grads = []
        current_mha_param_grads.extend(q_proj_grads)
        current_mha_param_grads.extend(k_proj_grads)
        current_mha_param_grads.extend(v_proj_grads)
        current_mha_param_grads.extend(c_proj_grads)

        return grad_x, current_mha_param_grads

In [None]:
########################
# Multi-Layer Perceptron (MLP)
########################

class MLP(Module):
    """
    Position-wise feed-forward block of a Transformer layer:
    Linear (n_emb -> 4*n_emb), GELU, Linear (4*n_emb -> n_emb), Dropout.
    """
    def __init__(self, n_emb, dropout_p):
        super().__init__()
        hidden = 4 * n_emb
        self.c_fc = Linear(n_emb, hidden)     # expansion
        self.c_proj = Linear(hidden, n_emb)   # projection back to n_emb
        self.dropout = Dropout(dropout_p)

    def parameters(self):
        """Returns all parameters of the MLP module (c_fc first, then c_proj)."""
        collected = []
        for layer in (self.c_fc, self.c_proj):
            collected = collected + layer.parameters()
        return collected

    def train(self, mode=True):
        """Sets the MLP module and its sub-modules to training/eval mode."""
        super().train(mode)
        for layer in (self.c_fc, self.c_proj, self.dropout):
            layer.train(mode)

    def forward(self, x):
        """
        Forward pass for MLP.
        x: input tensor, shape (B, T, n_emb)
        Returns: output tensor, shape (B, T, n_emb)
        """
        self._cache_x = x # input, kept alongside the intermediates

        fc_out = self.c_fc.forward(x)
        gelu_out = gelu(fc_out)
        proj_out = self.c_proj.forward(gelu_out)

        # Intermediates for the backward pass.
        self._cache = (fc_out, gelu_out, proj_out)
        return self.dropout.forward(proj_out)

    def backward(self, grad_output):
        """
        Backward pass for MLP.
        grad_output: gradient from subsequent layer.
        Returns: (grad_input, list_of_param_grads) with gradients ordered like
                 parameters(): c_fc, then c_proj.
        """
        fc_out = self._cache[0]

        # Walk the forward chain in reverse: dropout -> c_proj -> GELU -> c_fc.
        grad_proj_out, _ = self.dropout.backward(grad_output)
        grad_gelu_out, c_proj_grads = self.c_proj.backward(grad_proj_out)
        grad_fc_out = grad_gelu_out * gelu_prime(fc_out)  # chain rule through GELU
        grad_x, c_fc_grads = self.c_fc.backward(grad_fc_out)

        return grad_x, [*c_fc_grads, *c_proj_grads]

In [None]:
########################
# Transformer Block
########################

class Block(Module):
    """
    One pre-norm Transformer block:
        h   = x + MHA(LayerNorm(x))
        out = h + MLP(LayerNorm(h))
    """
    def __init__(self, n_emb, head_size, n_heads, n_ctx, dropout_p):
        super().__init__()
        self.ln_1 = LayerNorm(n_emb)
        self.mha = MultiHeadAttention(n_emb, head_size, n_heads, n_ctx, dropout_p)
        self.ln_2 = LayerNorm(n_emb)
        self.mlp = MLP(n_emb, dropout_p)

    def parameters(self):
        """Returns all parameters of the block, ordered ln_1, mha, ln_2, mlp."""
        collected = []
        for part in (self.ln_1, self.mha, self.ln_2, self.mlp):
            collected = collected + part.parameters()
        return collected

    def train(self, mode=True):
        """Sets the block and its sub-modules to training/eval mode."""
        super().train(mode)
        for part in (self.ln_1, self.ln_2, self.mha, self.mlp):
            part.train(mode)

    def forward(self, x):
        """
        Forward pass for a Transformer Block.
        x: input tensor, shape (B, T, n_emb)
        Returns: output tensor, shape (B, T, n_emb)
        """
        self._cache_x = x # input of the first residual branch

        # Attention sub-layer with residual connection.
        ln1_out = self.ln_1.forward(x)
        mha_out = self.mha.forward(ln1_out)
        x_res1 = x + mha_out

        # Feed-forward sub-layer with residual connection.
        ln2_out = self.ln_2.forward(x_res1)
        mlp_out = self.mlp.forward(ln2_out)

        self._cache = (ln1_out, mha_out, ln2_out, mlp_out, x_res1)
        return x_res1 + mlp_out

    def backward(self, grad_output):
        """
        Backward pass for a Transformer Block.
        grad_output: gradient from subsequent layer.
        Returns: (grad_input, list_of_param_grads) with gradients ordered like
                 parameters(): ln_1, mha, ln_2, mlp.
        """
        # Second residual: the incoming gradient reaches x_res1 both directly
        # (skip path) and through MLP -> ln_2.
        grad_ln2_out, mlp_grads = self.mlp.backward(grad_output)
        grad_through_ln2, ln2_grads = self.ln_2.backward(grad_ln2_out)
        grad_x_res1 = grad_output + grad_through_ln2

        # First residual: same pattern through MHA -> ln_1.
        grad_ln1_out, mha_grads = self.mha.backward(grad_x_res1)
        grad_through_ln1, ln1_grads = self.ln_1.backward(grad_ln1_out)
        grad_x = grad_x_res1 + grad_through_ln1

        return grad_x, [*ln1_grads, *mha_grads, *ln2_grads, *mlp_grads]

In [None]:
########################
# GPT Model Definition
########################

class GPT(Module):
    """
    A minimal GPT (Generative Pre-trained Transformer) model:
    token + positional embeddings, a stack of Transformer blocks, a final
    LayerNorm and a linear language-modeling head.
    """
    def __init__(self, vocab_size, n_ctx, n_emb, n_layers, head_size, n_heads, dropout_p):
        super().__init__()
        self.n_ctx = n_ctx
        self.wte = Embedding(vocab_size, n_emb)    # Token embeddings
        self.wpe = Embedding(n_ctx, n_emb)        # Positional embeddings
        self.blocks = [Block(n_emb, head_size, n_heads, n_ctx, dropout_p)
                       for _ in range(n_layers)]    # Stack of Transformer blocks
        self.ln_f = LayerNorm(n_emb)                # Final Layer Normalization
        self.lm_head = Linear(n_emb, vocab_size)    # Language modeling head (output logits)

    def parameters(self):
        """Returns all parameters of the GPT model: wte, wpe, each block in order, ln_f, lm_head."""
        params = []
        params += self.wte.parameters()
        params += self.wpe.parameters()
        for block in self.blocks:
            params += block.parameters()
        params += self.ln_f.parameters()
        params += self.lm_head.parameters()
        return params

    def train(self, mode=True):
        """Sets the GPT model and all its sub-modules to training/eval mode."""
        super().train(mode) # Call base Module train to set self.training
        self.wte.train(mode)
        self.wpe.train(mode)
        for block in self.blocks:
            block.train(mode)
        self.ln_f.train(mode)
        self.lm_head.train(mode)

    def forward(self, x):
        """
        Forward pass for the GPT model.
        x: input token IDs, shape (B, T)
        Returns: logits, shape (B, T, vocab_size)
        """
        B, T = x.shape
        tok_emb = self.wte.forward(x)           # (B, T, n_emb)
        pos_indices = np.arange(T)              # Positional indices for current sequence length
        pos_emb = self.wpe.forward(pos_indices) # (T, n_emb)

        # Combine token and position embeddings (positional embeddings are broadcast over the batch)
        x_combined_emb = tok_emb + pos_emb

        # Each block caches what it needs for its own backward pass, so only
        # the running activation has to be threaded through here.
        current_x = x_combined_emb
        for block in self.blocks:
            current_x = block.forward(current_x)

        ln_f_out = self.ln_f.forward(current_x)
        logits = self.lm_head.forward(ln_f_out)  # (B, T, vocab_size)

        # Store intermediate values for backward pass
        self._cache = (x_combined_emb, current_x, ln_f_out)
        return logits

    def backward(self, grad_output):
        """
        Backward pass for the GPT model.
        grad_output: gradient from the loss function, shape (B, T, vocab_size)
        Returns: (None, list_of_param_grads) - the inputs are token ids, so there
                 is no grad_input; the gradients are ordered like parameters().
        """
        # 1. Backward through lm_head
        grad_ln_f_out, lm_head_grads = self.lm_head.backward(grad_output)

        # 2. Backward through ln_f (final LayerNorm)
        grad_current_x, ln_f_grads = self.ln_f.backward(grad_ln_f_out)

        # 3. Backward through the blocks in reverse order, storing each block's
        #    gradients at its forward index so the final list matches parameters().
        block_grads_temp = [None] * len(self.blocks)
        grad_for_prev_block = grad_current_x
        for i in reversed(range(len(self.blocks))):
            grad_for_prev_block, block_grads_temp[i] = self.blocks[i].backward(grad_for_prev_block)

        # 4. Backward through tok_emb + pos_emb: the token embeddings receive the
        #    gradient as is; the positional embeddings were broadcast over the
        #    batch, so their gradient is summed over the batch axis.
        grad_tok_emb = grad_for_prev_block
        grad_pos_emb = np.sum(grad_for_prev_block, axis=0)

        # 5. Backward through wte and wpe (Embedding.backward returns (None, [grad_weight]))
        _, wte_grads = self.wte.backward(grad_tok_emb)
        _, wpe_grads = self.wpe.backward(grad_pos_emb)

        # Assemble in the same order as parameters().
        ordered_param_grads = []
        ordered_param_grads.extend(wte_grads)
        ordered_param_grads.extend(wpe_grads)
        for grads_list_for_block in block_grads_temp:
            ordered_param_grads.extend(grads_list_for_block)
        ordered_param_grads.extend(ln_f_grads)
        ordered_param_grads.extend(lm_head_grads)

        return None, ordered_param_grads

    def generate(self, prompt, max_new_tokens):
        """
        Samples new tokens from the model, one at a time.
        prompt: string to continue, encoded character by character with the
                module-level `stoi` table (KeyError for a character missing
                from it). None or an empty string starts from token id 0.
        max_new_tokens: number of tokens to sample.
        Returns: int array of shape (1, prompt_length + max_new_tokens) holding
                 the prompt ids followed by the sampled ids.

        Generation runs in evaluation mode (dropout off); the mode the model
        was in before the call is restored afterwards, even if sampling fails.
        """
        was_training = self.training
        self.eval()
        try:
            if not prompt:
                # No usable prompt: start from a single token with id 0.
                # (An empty context cannot be fed through the model.)
                ctx = np.zeros((1, 1), dtype=np.int32)
            else:
                ctx = np.array([stoi[c] for c in prompt]).reshape(1, -1)

            for _ in range(max_new_tokens):
                # The model only accepts up to n_ctx positions: feed the most recent ones.
                input_seq = ctx[:, -self.n_ctx:]

                logits = self.forward(input_seq)

                # Only the last position predicts the next token.
                logits = logits[:, -1, :] # Shape: (1, vocab_size)

                # Convert logits to probabilities
                probs = softmax(logits, axis=-1).flatten() # Flatten to (vocab_size,)

                # Sample the next token based on probabilities
                next_tok = np.random.choice(np.arange(probs.shape[0]), p=probs)

                # Append the new token to the context
                ctx = np.concatenate([ctx, np.array([[next_tok]], dtype=np.int32)], axis=1)
        finally:
            # Put the model back in the mode the caller left it in.
            self.train(was_training)
        return ctx

In [None]:
########################
# Loss Function
########################

def cross_entropy_loss(logits, targets):
    """
    Computes the mean cross-entropy loss and its gradient with respect to logits.
    logits: (B, T, vocab_size) - raw predictions from the model
    targets: (B, T) - true token IDs (integers, used as indices into the last axis)
    Returns: (loss_value, grad_logits)
             loss_value is averaged over all B*T positions and grad_logits,
             shaped like logits, is the gradient of that averaged loss.
    """
    B, T, C = logits.shape
    n_tokens = B * T
    logits_flat = logits.reshape(n_tokens, C)
    targets_flat = targets.reshape(n_tokens)
    rows = np.arange(n_tokens)

    # For numerical stability: subtract max logit from all logits before exponentiation
    shifted = logits_flat - np.max(logits_flat, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    sum_exp = np.sum(exp_shifted, axis=1, keepdims=True)

    # Log-softmax via log-sum-exp. sum_exp >= 1 because the max entry contributes
    # exp(0), so the log is always finite and no epsilon fudge is needed.
    log_probs = shifted - np.log(sum_exp)
    loss = -np.mean(log_probs[rows, targets_flat])

    # d(loss)/d(logits) = (softmax - one_hot) / (B*T).
    # The 1/(B*T) factor comes from the mean in the loss; without it the
    # returned gradient would belong to the summed loss, not the reported one.
    grad_logits = exp_shifted / sum_exp
    grad_logits[rows, targets_flat] -= 1
    grad_logits /= n_tokens

    return loss, grad_logits.reshape(B, T, C)

In [None]:
########################
# Calculate Value and Grad
########################

def value_and_grad(model, x, y):
    """
    Runs one forward/backward cycle on a batch.
    model: object exposing forward(x) and backward(grad_logits)
    x: batch of inputs handed to model.forward
    y: targets handed to cross_entropy_loss together with the logits
    Returns: (loss, grads) - the batch loss and the parameter gradients
             reported by model.backward
    """
    # Forward: predictions first, then the loss plus the gradient that seeds backprop
    predictions = model.forward(x)
    batch_loss, dlogits = cross_entropy_loss(predictions, y)

    # Backward: only the parameter gradients are of interest here; the first
    # element returned by model.backward (the input gradient) is discarded
    param_grads = model.backward(dlogits)[1]

    return batch_loss, param_grads

In [None]:
########################
# Optimizer
########################

class AdamW:
    """
    AdamW optimizer: Adam with *decoupled* weight decay.
    The decay shrinks the weights directly and is kept out of the gradient,
    so it is not rescaled by the adaptive moment estimates (folding it into
    the gradient would make this plain Adam with L2 regularization).
    """
    def __init__(self, parameters, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01):
        """
        parameters: list of trainable NumPy arrays; they are updated in place.
        learning_rate: step size.
        beta1, beta2: decay rates of the first / second moment estimates.
        eps: small constant added to the denominator for numerical stability.
        weight_decay: decoupled weight-decay coefficient (0 disables it).
        """
        self.params = parameters # List of all trainable parameters (NumPy arrays)
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.weight_decay = weight_decay
        self.t = 0 # Timestep counter

        # First and second moment estimates, keyed by the identity of each array
        self.m = {id(p): np.zeros_like(p) for p in self.params}
        self.v = {id(p): np.zeros_like(p) for p in self.params}

    def step(self, grads):
        """
        Performs a single optimization step (in-place parameter update).
        grads: a list of gradients, in the same order as self.params.
        """
        self.t += 1 # Increment timestep

        # Bias corrections depend only on the timestep, so compute them once
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        for p, g in zip(self.params, grads):
            pid = id(p) # Use object ID for unique parameter identification

            # Decoupled weight decay: shrink the weights directly
            if self.weight_decay:
                p *= 1.0 - self.lr * self.weight_decay

            # Update biased first moment estimate
            self.m[pid] = self.beta1 * self.m[pid] + (1 - self.beta1) * g
            # Update biased second raw moment estimate
            self.v[pid] = self.beta2 * self.v[pid] + (1 - self.beta2) * (g * g)

            # Bias-corrected moment estimates
            m_hat = self.m[pid] / correction1
            v_hat = self.v[pid] / correction2

            # Update parameters in place so the model sees the new values
            p -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

In [None]:
# Build the GPT model from the vocabulary size and the configured hyper-parameters
model = GPT(
    vocab_size,
    config["n_ctx"],
    config["n_emb"],
    config["n_layers"],
    config["head_size"],
    config["n_heads"],
    config["dropout"],
)

In [None]:
########################
# Training Loop
########################

# Trains the model for config["num_epochs"] epochs, running a validation pass
# after each one, and collects per-batch / per-epoch log lines and timings.
if model_load_workflow == "train":
    params = model.parameters() # Get all trainable parameters from the model
    optimizer = AdamW(params, learning_rate=config["lr"])

    # Hoisted so the f-strings below never nest double quotes inside a
    # double-quoted f-string, which is a syntax error before Python 3.12
    num_epochs = config["num_epochs"]

    batch_logs = []
    epoch_logs = []
    epoch_total_time = 0

    # Defaults so code that reads these names after training still finds
    # them when num_epochs is 0 and the loop body never runs
    train_batch_all = val_batch_all = 0
    avg_train_loss = avg_val_loss = float('nan')

    print("--- Start training ---")
    for epoch in range(num_epochs):
        epoch_start_time = time.time()  # Record start time

        # Training phase
        model.train(True) # Set model to training mode (enables dropout)
        running_train_loss = 0
        train_batch_cnt = 0

        # Materialized as a list because the batch count is needed for the logs
        train_batches = list(get_batches(X_train, y_train, config["batch_size"], shuffle=True))
        train_batch_all = len(train_batches)
        if not train_batches:
            # Epochs are reported 1-based, like every other log line
            print(f"Epoch {(epoch+1):2} over {num_epochs:2} | No training data batches available. Skipping training for this epoch.")
            avg_train_loss = float('nan')
        else:
            for X_batch, y_batch in train_batches:
                train_batch_cnt += 1
                train_batch_start_time = time.time()  # Record start time
                # Compute loss and gradients
                loss, grads = value_and_grad(model, X_batch, y_batch)
                # Update model parameters using the optimizer
                optimizer.step(grads)
                running_train_loss += loss
                train_batch_stop_time = time.time()  # Record stop time
                train_batch_elapsed_time = train_batch_stop_time - train_batch_start_time
                # Print loss for the current batch
                batch_log = f"Batch {train_batch_cnt:2} over {train_batch_all:2} | train_loss = {loss:.4f} | execution_time = {train_batch_elapsed_time:.4f}"
                batch_logs.append(batch_log)
                print(batch_log)
            avg_train_loss = running_train_loss / train_batch_cnt

        # Validation phase
        model.train(False) # Set model to evaluation mode (disables dropout)
        running_val_loss = 0
        val_batch_cnt = 0

        val_batches = list(get_batches(X_val, y_val, config["batch_size"], shuffle=False))
        val_batch_all = len(val_batches)
        if not val_batches:
            print(f"Epoch {(epoch+1):2} over {num_epochs:2} | No validation data batches available. Skipping validation for this epoch.")
            avg_val_loss = float('nan') # Indicate no validation was performed
        else:
            for X_batch, y_batch in val_batches:
                val_batch_cnt += 1
                val_batch_start_time = time.time()  # Record start time
                # In validation, only forward pass and loss computation are needed
                logits = model.forward(X_batch)
                loss, _ = cross_entropy_loss(logits, y_batch) # Don't need gradients for validation
                running_val_loss += loss
                val_batch_stop_time = time.time()  # Record stop time
                val_batch_elapsed_time = val_batch_stop_time - val_batch_start_time
                # Print loss for the current batch
                batch_log = f"Batch {val_batch_cnt:2} over {val_batch_all:2} | val_loss = {loss:.4f} | execution_time = {val_batch_elapsed_time:.4f}"
                batch_logs.append(batch_log)
                print(batch_log)
            avg_val_loss = running_val_loss / val_batch_cnt

        epoch_stop_time = time.time()  # Record stop time
        epoch_elapsed_time = epoch_stop_time - epoch_start_time
        epoch_total_time += epoch_elapsed_time

        # Print average losses for the epoch
        epoch_log = f"Epoch {(epoch+1):2} over {num_epochs:2} | train_loss = {avg_train_loss:.4f} | val_loss = {avg_val_loss:.4f} | execution_time = {epoch_elapsed_time:.4f}"
        epoch_logs.append(epoch_log)
        print(epoch_log)

    # Guard against a zero-epoch configuration
    epoch_total_time_avg = epoch_total_time / num_epochs if num_epochs else 0.0
    print(f"Average epoch time: {epoch_total_time_avg:.4f}")

    print("--- Stop training ---")

In [None]:
########################
# Save the report
########################

# Writes a JSON training report (configuration, summary figures and the
# collected log lines) to outputs/report_<uuid>.json.
if model_load_workflow == "train":
    analysis = {
        "num_epochs": config["num_epochs"],
        "train_batches_per_epoch": train_batch_all,
        "val_batches_per_epoch": val_batch_all,
        # avg_train_loss / avg_val_loss are reassigned on every epoch, so the
        # two loss entries hold the averages of the LAST epoch
        "average_train_loss_per_epoch": avg_train_loss,
        "average_val_loss_per_epoch": avg_val_loss,
        "average_train_time_per_epoch": epoch_total_time_avg
    }

    report = {
        "config": config,
        "analysis": analysis,
        "batch_logs": batch_logs,
        "epoch_logs": epoch_logs
    }

    # Make sure the target directory exists so the report of a finished
    # training run is not lost to a FileNotFoundError
    pt("outputs").mkdir(parents=True, exist_ok=True)
    with open(f"outputs/report_{model_config_uuid}.json", 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=4) # indent for pretty printing

In [None]:
########################
# Save the model
########################

# Persists the trained model as a pickle of the whole object and/or as a JSON
# file of its weights. The file names follow the models/model_<uuid>.<ext>
# convention that the loading and extraction code reads back.
if model_load_workflow == "train":
    # Determine how to save the model based on the model_load_method
    model_load_method = "both"  # "both" or "object" or "weights"

    # Create the target directory on first use
    pt('models').mkdir(parents=True, exist_ok=True)

    if model_load_method in ("object", "both"):
        # This method stores the entire model into a single file
        with open(f'models/model_{model_config_uuid}.pkl', 'wb') as f:
            pk.dump(model, f)
    if model_load_method in ("weights", "both"):
        # Extract all weight arrays from the model (a missing bias stays None)
        weights_dict = {
            # Token and position embeddings
            'wte_weight': model.wte.weight,
            'wpe_weight': model.wpe.weight,

            # Final layer norm
            'ln_f_gamma': model.ln_f.gamma,
            'ln_f_beta': model.ln_f.beta,

            # Language model head
            'lm_head_weight': model.lm_head.weight,
            'lm_head_bias': model.lm_head.bias,
        }

        # Add block weights
        for i, block in enumerate(model.blocks):
            prefix = f'block_{i}'

            # Layer norms
            weights_dict[f'{prefix}_ln1_gamma'] = block.ln_1.gamma
            weights_dict[f'{prefix}_ln1_beta'] = block.ln_1.beta
            weights_dict[f'{prefix}_ln2_gamma'] = block.ln_2.gamma
            weights_dict[f'{prefix}_ln2_beta'] = block.ln_2.beta

            # Multi-head attention
            weights_dict[f'{prefix}_mha_q_weight'] = block.mha.q_proj.weight
            weights_dict[f'{prefix}_mha_k_weight'] = block.mha.k_proj.weight
            weights_dict[f'{prefix}_mha_v_weight'] = block.mha.v_proj.weight
            weights_dict[f'{prefix}_mha_c_weight'] = block.mha.c_proj.weight
            weights_dict[f'{prefix}_mha_c_bias'] = block.mha.c_proj.bias

            # MLP
            weights_dict[f'{prefix}_mlp_fc_weight'] = block.mlp.c_fc.weight
            weights_dict[f'{prefix}_mlp_fc_bias'] = block.mlp.c_fc.bias
            weights_dict[f'{prefix}_mlp_proj_weight'] = block.mlp.c_proj.weight
            weights_dict[f'{prefix}_mlp_proj_bias'] = block.mlp.c_proj.bias

        # Convert numpy arrays to lists for JSON serialization (None passes through)
        json_weights_dict = {
            key: value.tolist() if hasattr(value, 'tolist') else value
            for key, value in weights_dict.items()
        }

        # Save weights only
        with open(f'models/model_{model_config_uuid}.weights', 'w') as f:
            json.dump(json_weights_dict, f, indent=4)

In [None]:
########################
# Load the model
########################

# Reads a saved model back for the "train" and "inference" workflows: either
# the whole pickled object, the JSON weights (copied onto the existing `model`),
# or both, depending on model_load_method.
if model_load_workflow in ("train", "inference"):
    # Selects what is read back from disk
    model_load_method = "weights" # "both" or "object" or "weights"

    if model_load_method in ("object", "both"):
        # Whole-object restore: replaces `model` with the unpickled instance
        with open(f'models/model_{model_config_uuid}.pkl', 'rb') as f:
            model = pk.load(f)

    if model_load_method in ("weights", "both"):
        with open(f'models/model_{model_config_uuid}.weights', 'r') as f:
            json_weights_dict = json.load(f)

        # JSON lists become float32 arrays again; anything else (e.g. a null
        # bias) is carried over unchanged
        weights_dict = {}
        for key, value in json_weights_dict.items():
            weights_dict[key] = np.array(value, dtype=np.float32) if isinstance(value, list) else value

        # Each row: (layer object, attribute name, key in weights_dict, optional?)
        # Optional entries are biases, which are only assigned when a value was stored
        top_level_targets = (
            (model.wte, 'weight', 'wte_weight', False),
            (model.wpe, 'weight', 'wpe_weight', False),
            (model.ln_f, 'gamma', 'ln_f_gamma', False),
            (model.ln_f, 'beta', 'ln_f_beta', False),
            (model.lm_head, 'weight', 'lm_head_weight', False),
            (model.lm_head, 'bias', 'lm_head_bias', True),
        )
        for layer, attr, weight_key, optional in top_level_targets:
            stored = weights_dict[weight_key]
            if not optional or stored is not None:
                setattr(layer, attr, stored)

        # Same procedure for every transformer block, keyed by its index
        for i, block in enumerate(model.blocks):
            block_targets = (
                # Layer norms
                (block.ln_1, 'gamma', f'block_{i}_ln1_gamma', False),
                (block.ln_1, 'beta', f'block_{i}_ln1_beta', False),
                (block.ln_2, 'gamma', f'block_{i}_ln2_gamma', False),
                (block.ln_2, 'beta', f'block_{i}_ln2_beta', False),
                # Multi-head attention
                (block.mha.q_proj, 'weight', f'block_{i}_mha_q_weight', False),
                (block.mha.k_proj, 'weight', f'block_{i}_mha_k_weight', False),
                (block.mha.v_proj, 'weight', f'block_{i}_mha_v_weight', False),
                (block.mha.c_proj, 'weight', f'block_{i}_mha_c_weight', False),
                (block.mha.c_proj, 'bias', f'block_{i}_mha_c_bias', True),
                # MLP
                (block.mlp.c_fc, 'weight', f'block_{i}_mlp_fc_weight', False),
                (block.mlp.c_fc, 'bias', f'block_{i}_mlp_fc_bias', True),
                (block.mlp.c_proj, 'weight', f'block_{i}_mlp_proj_weight', False),
                (block.mlp.c_proj, 'bias', f'block_{i}_mlp_proj_bias', True),
            )
            for layer, attr, weight_key, optional in block_targets:
                stored = weights_dict[weight_key]
                if not optional or stored is not None:
                    setattr(layer, attr, stored)

In [None]:
########################
# Extract model components
########################

# Converts pickled models into JSON weight files (models/model_<uuid>.weights).
# "from_pkl_obj" expects the pickle to hold a whole model object,
# "from_pkl_wgt" expects it to hold a dict of weight arrays.
if model_load_workflow == "extract":
    model_load_workflow_type = input("Enter the extract type (from_pkl_obj/from_pkl_wgt): ").lower()

    # Resolve which model UUIDs to process
    model_config_uuids = []
    if model_config_uuid == "all":
        # Define the directory
        directory = pt('models')

        # Regex pattern for GUID
        pattern = re.compile(r'model_([a-f0-9\-]{36})\.pkl')

        # Find matching files and extract GUIDs
        for file in directory.glob('model_*.pkl'):
            match = pattern.fullmatch(file.name)
            if match:
                model_config_uuids.append(match.group(1))
    elif is_valid_guid(str(model_config_uuid)):
        # A single, explicitly selected model
        model_config_uuids.append(str(model_config_uuid))

    # Loop through each GUID to extract weights. The loop variable has its own
    # name so the module-level model_config_uuid is not overwritten.
    for extract_uuid in model_config_uuids:
        pickle_path = f'models/model_{extract_uuid}.pkl'

        # NOTE: unpickling executes arbitrary code; only load files you created yourself
        if model_load_workflow_type == "from_pkl_obj":
            # Load the model from a single file
            with open(pickle_path, 'rb') as f:
                model_ = pk.load(f)

            # Extract all weight arrays from the model (a missing bias stays None)
            weights_dict = {
                # Token and position embeddings
                'wte_weight': model_.wte.weight,
                'wpe_weight': model_.wpe.weight,

                # Final layer norm
                'ln_f_gamma': model_.ln_f.gamma,
                'ln_f_beta': model_.ln_f.beta,

                # Language model head
                'lm_head_weight': model_.lm_head.weight,
                'lm_head_bias': model_.lm_head.bias,
            }

            # Add block weights
            for i, block in enumerate(model_.blocks):
                prefix = f'block_{i}'

                # Layer norms
                weights_dict[f'{prefix}_ln1_gamma'] = block.ln_1.gamma
                weights_dict[f'{prefix}_ln1_beta'] = block.ln_1.beta
                weights_dict[f'{prefix}_ln2_gamma'] = block.ln_2.gamma
                weights_dict[f'{prefix}_ln2_beta'] = block.ln_2.beta

                # Multi-head attention
                weights_dict[f'{prefix}_mha_q_weight'] = block.mha.q_proj.weight
                weights_dict[f'{prefix}_mha_k_weight'] = block.mha.k_proj.weight
                weights_dict[f'{prefix}_mha_v_weight'] = block.mha.v_proj.weight
                weights_dict[f'{prefix}_mha_c_weight'] = block.mha.c_proj.weight
                weights_dict[f'{prefix}_mha_c_bias'] = block.mha.c_proj.bias

                # MLP
                weights_dict[f'{prefix}_mlp_fc_weight'] = block.mlp.c_fc.weight
                weights_dict[f'{prefix}_mlp_fc_bias'] = block.mlp.c_fc.bias
                weights_dict[f'{prefix}_mlp_proj_weight'] = block.mlp.c_proj.weight
                weights_dict[f'{prefix}_mlp_proj_bias'] = block.mlp.c_proj.bias

        elif model_load_workflow_type == "from_pkl_wgt":
            with open(pickle_path, 'rb') as f:
                weights_dict = pk.load(f)

        else:
            # Unknown extract type: nothing to do for this model
            continue

        # Convert numpy arrays to lists for JSON serialization; json.dump
        # cannot serialize ndarrays, so this applies to both extract types
        json_weights_dict = {
            key: value.tolist() if hasattr(value, 'tolist') else value
            for key, value in weights_dict.items()
        }

        # Store the model weights to a single file
        with open(f'models/model_{extract_uuid}.weights', 'w') as f:
            json.dump(json_weights_dict, f, indent=4)

In [None]:
########################
# Inference (Generation)
########################

# Generates text with the current model, prints the result and stores it as
# JSON under outputs/. Runs after training (no prompt) and in the inference
# workflow (prompt chosen interactively).
inference_total_time = 0
if model_load_workflow == "train" or model_load_workflow == "inference":
    print("--- Start Inference ---")

    # Generate 500 new tokens
    inference_max_tokens = 500

    if model_load_workflow == "train":
        prompt = None
        completion_output_name = f"outputs/completion_{model_config_uuid}.json"
    elif model_load_workflow == "inference":
        # NOTE(review): lower-casing makes the "last"/"none" keywords
        # case-insensitive but also lower-cases a free-text prompt - confirm
        # that the vocabulary is lower-case only
        prompt_inpt = input("Enter the prompt for inference (<prompt>/last/none): ").lower()
        if prompt_inpt == "last":
            prompt = prompt_last
        elif prompt_inpt == "none":
            prompt = None
        else:
            prompt = prompt_inpt

        # Timestamp the file name so repeated runs do not overwrite each other
        today = dt.today()
        today_ft = today.strftime('%Y%m%d%H%M%S')
        completion_output_name = f"outputs/completion_{model_config_uuid}_{today_ft}.json"

    inference_start_time = time.time()  # Record start time

    # Generate text based on given token ids
    generation_ids = model.generate(prompt, inference_max_tokens)

    # Decode the generated token IDs back to text
    generation = decode(generation_ids[0].tolist())

    inference_stop_time = time.time()  # Record stop time
    inference_elapsed_time = inference_stop_time - inference_start_time
    inference_total_time += inference_elapsed_time

    # Create a completion object with the prompt, generated text, and inference time
    completion = { "prompt": prompt, "generation": generation, "inference_max_tokens": inference_max_tokens, "inference_total_time": inference_total_time }

    print(completion)
    print("--- Stop Inference ---")

    # Save the completion text to a file, creating the output directory on
    # first use so a finished generation is not lost to a FileNotFoundError
    pt(completion_output_name).parent.mkdir(parents=True, exist_ok=True)
    with open(completion_output_name, 'w', encoding='utf-8') as f:
        json.dump(completion, f, indent=4)

In [None]:
# Save the model UUID and prompt of this run to config/last.json.
# In the train workflow model_config_uuid is a uuid.UUID object; handing that
# to uuid.UUID(...) raises AttributeError, which is_valid_guid does not catch,
# so the value is normalised to a string before it is validated.
# NOTE(review): in the code visible here `prompt` is only assigned by the
# inference step - confirm it is defined in the extract workflow.
model_config_uuid_str = str(model_config_uuid)
if is_valid_guid(model_config_uuid_str):
    last = { "last_uuid": model_config_uuid_str, "last_prompt": str(prompt) }
    # Create the directory on first use
    pt('config').mkdir(parents=True, exist_ok=True)
    with open('config/last.json', 'w', encoding='utf-8') as f:
        json.dump(last, f, indent=4)