In [1]:
import numpy as np

from transformer_from_scratch import (
    create_input_output_pairs,
    create_vocabulary,
    cross_entropy_loss,
    load_dataset,
    NumpyGPT,
    one_hot_encode,
    text_to_indices,
    tokenize,
)

In [2]:
# Load and preprocess the dataset
#
# Pipeline: read the raw text, tokenize it, build the vocabulary and its two
# lookup tables, convert the text to indices, then cut the index stream into
# (input, target) pairs for training.
file_path = "dataset.txt"
seq_len = 128  # value passed to create_input_output_pairs; named so it is tuned in one place

text = load_dataset(file_path)
tokens = tokenize(text)
vocab, token_to_idx, idx_to_token = create_vocabulary(tokens)
# NOTE(review): indices are built from the raw `text`, not from `tokens` --
# presumably text_to_indices tokenizes internally the same way tokenize()
# does; confirm against its definition.
indices = text_to_indices(text, token_to_idx)
input_data, output_data = create_input_output_pairs(indices, seq_len=seq_len)

# The training cell reorders both arrays with a single permutation sized from
# input_data, so they must hold the same number of samples.
assert len(input_data) == len(output_data), "input/output pair counts differ"

In [3]:
# Build the NumpyGPT model; its vocabulary size is taken from the vocabulary
# created during preprocessing.
model = NumpyGPT(
    vocab_size=len(vocab),
    d_model=512,
    nhead=8,
    num_layers=4,
)

In [4]:
# Training hyperparameters, read by the training loop cell.
num_epochs = 10        # passes over the full dataset
batch_size = 64        # samples per parameter update
learning_rate = 0.001  # step size applied to every gradient

In [5]:
# training loop
#
# Plain mini-batch gradient descent: for every epoch, visit the samples in a
# fresh random order, run forward/backward on each batch, and subtract
# learning_rate * gradient from each parameter listed below.
#
# NOTE(review): the run recorded in this notebook ended with
#   ValueError: operands could not be broadcast together with shapes
#   (64,128,13331) (64,128,512)
# i.e. a (batch_size, seq_len, <presumably len(vocab)>) array meeting a
# (batch_size, seq_len, d_model) array. The failing line is not visible in
# this notebook (the traceback was not kept), so check how
# NumpyGPT.forward/backward treat the one-hot input before trusting this loop.
num_samples = input_data.shape[0]
vocab_size = len(vocab)  # loop-invariant, so computed once
num_batches = -(-num_samples // batch_size)  # ceiling division; the last batch may be short

# Progress cadence, in batches. The previous check `i % 1000 == 0` was applied
# to a sample offset that advances by batch_size, so with batch_size = 64 it
# only fired every 8000 samples (125 batches). Presumably the intent was
# "roughly every 1000 samples"; adjust if a different cadence is wanted.
log_every = max(1, 1000 // batch_size)

for epoch in range(num_epochs):
    # Draw a new sample order for this epoch. Batches are gathered through the
    # permutation rather than by rebinding input_data / output_data, so the
    # notebook-level arrays keep their original order and this cell can be
    # re-run without side effects on earlier cells' data.
    shuffle_indices = np.random.permutation(num_samples)

    for batch_idx, i in enumerate(range(0, num_samples, batch_size)):
        # Get the current batch (row ids for this slice of the permutation)
        batch_ids = shuffle_indices[i:i + batch_size]
        inputs = input_data[batch_ids]
        targets = output_data[batch_ids]

        # One-hot encode the inputs
        inputs_one_hot = one_hot_encode(inputs, vocab_size)

        # Forward pass
        logits = model.forward(inputs_one_hot)

        # Compute the loss (used for reporting only; gradients come from backward)
        loss = cross_entropy_loss(logits, targets)

        # Backward pass
        d_embedding, d_transformer_layers, d_fc_out = model.backward(logits, targets)

        # Update the model parameters
        model.embedding -= learning_rate * d_embedding
        model.fc_out -= learning_rate * d_fc_out

        # Update the transformer layers. Each per-layer gradient entry is
        # unpacked as five items; the first one is not applied to any parameter.
        # Only the parameters named below are updated by this loop.
        for layer, (_d_layer, d_att, d_norm1, d_norm2, d_ff) in zip(model.transformer_layers, d_transformer_layers):
            layer.attention.W_q -= learning_rate * d_att[0]
            layer.attention.W_k -= learning_rate * d_att[1]
            layer.attention.W_v -= learning_rate * d_att[2]
            layer.norm1.gamma -= learning_rate * d_norm1[0]
            layer.norm1.beta -= learning_rate * d_norm1[1]
            layer.norm2.gamma -= learning_rate * d_norm2[0]
            layer.norm2.beta -= learning_rate * d_norm2[1]
            layer.feed_forward.fc1 -= learning_rate * d_ff[0]
            layer.feed_forward.fc2 -= learning_rate * d_ff[1]

        # Report against the batch counter so the "Batch x/y" label is a real
        # batch index out of the real number of batches.
        if batch_idx % log_every == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx + 1}/{num_batches}, Loss: {loss:.4f}")

ValueError: operands could not be broadcast together with shapes (64,128,13331) (64,128,512) 