## Imports


In [215]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [216]:
import sys

# Add the parent directory to the module search path. The project-local packages
# imported below (`model`, `helpers`) presumably live there — verify against the repo layout.
sys.path.append("..")

from pathlib import Path
import matplotlib.pyplot as plt
from tinygrad import Tensor, dtypes, Device

from model.llm import LLM
from model.tokenizer import Tokenizer, train_tokenizer

from helpers.dataset import NextTokenPredictionDataset
from helpers.trainer import train
from helpers.config import LLMConfig, TrainingConfig
from helpers.dataloader import DataLoader

# Show which tinygrad backend is selected by default on this machine.
print(Device.DEFAULT)

METAL


## Set config


In [217]:
# Embedding width, named once so the feed-forward hidden width below stays tied to it
# (previously the literal 256 was repeated and could drift out of sync).
DIM_EMB = 256

# specifies architecture and hyperparameters of the language model
llm_config = LLMConfig(
    # size of the vocab the model can understand
    vocab_size=4096,
    # max sequence length of the input tokens; # of tokens model can process in 1 forward pass
    seq_len=128,
    # dimensionality of embedding vectors; each token in vocab is represented by a vector of this size
    dim_emb=DIM_EMB,
    # num of layers (or transformer blocks) in the model; each consists of sub-layers like self-attention and feedforward
    num_layers=4,
    # num of attention heads in the multi-head attention mechanism
    num_heads=8,
    # dropout rate applied to embedding layer to prevent overfitting
    emb_dropout=0.0,
    # dimensionality of hidden layer in feedforward network; here 4x the embedding width
    ffn_dim_hidden=4 * DIM_EMB,
    # whether to include a bias term in the feedforward network layers
    ffn_bias=False,
)

# specifies parameters and settings for training the language model
train_config = TrainingConfig(
    # whether to retrain the tokenizer even if a saved tokenizer model already exists
    retrain_tokenizer=False,
    # num of samples per batch of training
    batch_size=64,
    # learning rate for the optimizer; controls how much to adjust the weights w/ respect to the loss gradient
    learning_rate=3e-4,
    # weight decay parameter; helps prevent overfitting by penalizing large weights
    weight_decay=1e-5,
    # max number of epochs (full passes thru training dataset) to train the model for
    max_epochs=1,
    # freq of logging training progress; passed to the trainer as `log_every`
    log_frequency=1,
)

## Prepare tokenizer and dataset


In [218]:
# Text corpus used to fit the tokenizer.
input_file = "../data/shakespeare/tinyshakespeare.txt"
# Tokenizer model path: same location and stem as the corpus, with a ".model" suffix.
output_file = Path(input_file).with_suffix(".model")

# (Re)train when no saved tokenizer model exists yet, or when the config asks for it.
# Presumably train_tokenizer writes its model to `output_file` — verify in model/tokenizer.
tokenizer_missing = not output_file.exists()
if tokenizer_missing or train_config.retrain_tokenizer:
    train_tokenizer(input_file, llm_config.vocab_size)

# Load the tokenizer from the saved model file.
tokenizer = Tokenizer(str(output_file))

In [219]:
# Sample sentence used to sanity-check the tokenizer.
sentence = "Before we proceed any further, hear me speak."

# Show how the sentence is split into sub-word pieces.
pieces = tokenizer.sp.EncodeAsPieces(sentence)
print(pieces)

# Encoding followed by decoding must give back the original sentence unchanged.
round_trip = tokenizer.decode(tokenizer.encode(sentence))
assert round_trip == sentence

['▁Before', '▁we', '▁proceed', '▁any', '▁further', ',', '▁hear', '▁me', '▁speak', '.']


In [220]:
# Dataset of (input, target) sequences of length `seq_len` built from the corpus
# for next-token prediction.
ds_train = NextTokenPredictionDataset(input_file, llm_config.seq_len, tokenizer)
# Loader that groups dataset samples into shuffled batches.
dl_train = DataLoader(ds_train, batch_size=train_config.batch_size, shuffle=True)

# Peek at the first batch only, to verify the batch dimensions.
# An empty loader prints nothing, exactly like a loop that never runs.
first_batch = next(iter(dl_train), None)
if first_batch is not None:
    inputs, labels = first_batch
    print(inputs.shape, labels.shape)

(64, 128) (64, 128)


## Define model


In [221]:
# Collect the constructor arguments first, then build the language model from them.
model_kwargs = dict(
    # vocabulary size, taken from the loaded tokenizer
    vocab_size=tokenizer.vocab_size,
    # max number of tokens per input sequence
    seq_len=llm_config.seq_len,
    # width of the embedding vectors
    dim_emb=llm_config.dim_emb,
    # number of transformer blocks
    num_layers=llm_config.num_layers,
    # number of heads in multi-head attention
    attn_num_heads=llm_config.num_heads,
    # dropout rate on the embedding layer
    emb_dropout=llm_config.emb_dropout,
    # hidden width of the feed-forward network
    ffn_hidden_dim=llm_config.ffn_dim_hidden,
    # whether the feed-forward layers carry a bias term
    ffn_bias=llm_config.ffn_bias,
)
model = LLM(**model_kwargs)

## Count parameters

In [222]:
from prettytable import PrettyTable
import numpy as np

def count_parameters(model):
    """Print a per-tensor parameter table for ``model`` and return the total.

    The count covers a fixed list of tensors: the token embedding weight, the
    norm gains, attention projections and feed-forward weights/bias of every
    entry in ``model.transformer_blocks``, the final norm gain, and the
    projection head bias. Each tensor contributes the product of its ``shape``.

    NOTE(review): only the projection head's bias is counted, not a weight —
    presumably the weight is shared with the token embedding; confirm against
    the model definition.

    Args:
        model: object exposing the attributes listed above.

    Returns:
        The summed element count of the listed tensors (a NumPy integer,
        since each count comes from ``np.prod``).
    """

    def iter_named_tensors():
        # Yields (label, tensor) pairs in the order they appear in the table.
        yield "token_embedding.weight", model.token_embedding.weight

        for layer_idx, layer in enumerate(model.transformer_blocks):
            prefix = f"transformer.{layer_idx}"
            # Attention sub-layer: pre-norm gain, then QKV and output projections.
            yield f"{prefix}.norm_attn.gain", layer.norm_attn.gain
            yield f"{prefix}.multihead_attn.proj_qkv.weight", layer.multihead_attn.proj_qkv.weight
            yield f"{prefix}.multihead_attn.proj_out.weight", layer.multihead_attn.proj_out.weight
            # Feed-forward sub-layer: pre-norm gain, then the linear layers.
            yield f"{prefix}.norm_ffn.gain", layer.norm_ffn.gain
            yield f"{prefix}.feed_forward.linear1.weight", layer.feed_forward.linear1.weight
            yield f"{prefix}.feed_forward.swiglu.linear.weight", layer.feed_forward.swiglu.linear.weight
            yield f"{prefix}.feed_forward.swiglu.linear.bias", layer.feed_forward.swiglu.linear.bias
            yield f"{prefix}.feed_forward.linear2.weight", layer.feed_forward.linear2.weight

        yield "norm.gain", model.norm.gain
        yield "projection_head.bias", model.projection_head.bias

    summary = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for label, tensor in iter_named_tensors():
        count = np.prod(tensor.shape)
        summary.add_row([label, count])
        total_params += count

    print(summary)
    print(f"Total Trainable Params: {total_params}")
    return total_params

# Print the per-module parameter table for `model`; the returned total is shown as the cell result.
count_parameters(model)


+-------------------------------------------------+------------+
|                     Modules                     | Parameters |
+-------------------------------------------------+------------+
|              token_embedding.weight             |  1048576   |
|           transformer.0.norm_attn.gain          |    256     |
|   transformer.0.multihead_attn.proj_qkv.weight  |   196608   |
|   transformer.0.multihead_attn.proj_out.weight  |   65536    |
|           transformer.0.norm_ffn.gain           |    256     |
|    transformer.0.feed_forward.linear1.weight    |   262144   |
| transformer.0.feed_forward.swiglu.linear.weight |  2097152   |
|  transformer.0.feed_forward.swiglu.linear.bias  |    2048    |
|    transformer.0.feed_forward.linear2.weight    |   262144   |
|           transformer.1.norm_attn.gain          |    256     |
|   transformer.1.multihead_attn.proj_qkv.weight  |   196608   |
|   transformer.1.multihead_attn.proj_out.weight  |   65536    |
|           transformer.1

np.int64(12597504)

## Train model


In [230]:
# Optimisation settings forwarded to the trainer, all taken from `train_config`.
train_options = {
    # optimizer learning rate
    "lr": train_config.learning_rate,
    # number of epochs to train for
    "max_epochs": train_config.max_epochs,
    # weight decay strength
    "weight_decay": train_config.weight_decay,
    # how often training progress is logged
    "log_every": train_config.log_frequency,
}

# Train `model` on the batches from `dl_train`; the result is kept as `loss_history`.
# NOTE(review): the recorded output of this cell ends in an AssertionError after
# "has no gradient" messages for an int tensor and several bool (128, 128) tensors.
# The cause lies in code outside this notebook — confirm before relying on `loss_history`.
loss_history = train(model, dl_train, **train_options)

Training on METAL.
Epoch 1/1:
Parameter <Tensor <LB METAL (1, 1, 4096, 1) int ShapeTracker(views=(View(shape=(1, 1, 4096, 1), strides=(0, 0, 1, 0), offset=0, mask=None, contiguous=True),))> on METAL with grad None> has no gradient
Parameter <Tensor <LB METAL (128, 128) bool (<BinaryOps.CMPLT: 6>, None)> on METAL with grad None> has no gradient
Parameter <Tensor <LB METAL (128, 128) bool (<BinaryOps.CMPLT: 6>, None)> on METAL with grad None> has no gradient
Parameter <Tensor <LB METAL (128, 128) bool (<BinaryOps.CMPLT: 6>, None)> on METAL with grad None> has no gradient
Parameter <Tensor <LB METAL (128, 128) bool (<BinaryOps.CMPLT: 6>, None)> on METAL with grad None> has no gradient


AssertionError: 

In [None]:
# One wide figure with a single axis for the loss curve.
fig, ax = plt.subplots(figsize=(12, 4))

# x: step index 0..len-1, y: the recorded training loss at that step.
steps = range(len(loss_history["train_loss"]))
ax.plot(steps, loss_history["train_loss"])

# Label both axes in one call.
ax.set(xlabel="step", ylabel="cross entropy loss")
# Horizontal grid lines only, to make loss values easier to read off.
ax.grid(axis="y")

## Play around


In [None]:
# empty prompt to generate random stuff
# Build a (1, seq_len) int32 tensor filled with the end-of-sequence token id.
# This notebook imports tinygrad (`Tensor`, `dtypes`), not torch, so the tensor is
# created with tinygrad's API. tinygrad places new tensors on the default device,
# so no explicit device transfer is needed.
prompt = Tensor.full((1, llm_config.seq_len), tokenizer.eos_id, dtype=dtypes.int32)

# generates a sequence of tokens using the model, starting from `prompt`
out = model.generate(prompt, max_seq_len=64)
# decodes the generated token ids back into a human-readable string
tokenizer.decode(out.tolist())

In [None]:
# generate from a prompt
# encodes the prompt text into a sequence of token ids using the tokenizer
prompt = tokenizer.encode(
    # prompt text to be encoded
    "KING HENRY VI:",
    # indicates this is the beginning of the string
    beg_of_string=True,
    # pads the sequence to the specified length
    pad_seq=True,
    # length to which the sequence should be padded or truncated
    seq_len=llm_config.seq_len,
)
# Convert the encoded prompt to an int32 tinygrad tensor and add a leading batch
# dimension with `.unsqueeze(0)`. This notebook imports tinygrad (`Tensor`, `dtypes`),
# not torch; tinygrad places new tensors on the default device, so no explicit
# device transfer is needed.
inputs = Tensor(prompt, dtype=dtypes.int32).unsqueeze(0)
# generates a sequence of tokens starting from the given prompt
out = model.generate(inputs, max_seq_len=64)
# decodes the generated token ids back into a human-readable string
tokenizer.decode(out.tolist())