In [2]:
# Jupyter notebook for GPT pretraining.

import os

from datasets.text_dataset import TextDataset
from tokenizers.char_tokenizer import CharTokenizer
from layers.gpt import GPT
from layers import layer_utils
from evaluate import Evaluator

from collections.abc import Callable
import torch, torch.nn as nn


# Define hyperparameters.

# Dataset hyper parameters
data_filename = 'testdata/tinyshakespeare.txt'
train_fraction = 0.9

# Tokenizer hyperparameters.
tokenizer = CharTokenizer(filename = data_filename)

# Architecture hyperparameters.
embedding_dimension = 64
num_heads = 8
head_dimension = 16
num_decoder_blocks = 10

# Training hyperparameters.
max_block_size = 24
batch_size = 32
num_batches_to_train = 500

# Evaluation hyperparameters.
num_batches_to_evaluate = 10
num_tokens_to_generate_during_evaluation = 10
num_batches_between_evaluations = 10

# Output parameters.
output_model_path = 'output/gpt.pt'
output_params_path = 'output/num_parameters.yaml'

# Fixing seed for reproducing results.
torch.manual_seed(123)

# Create directory corresponding to output_model_path if it does not exist.
model_dirname = os.path.dirname(output_model_path)
if not os.path.exists(model_dirname):
    os.makedirs(model_dirname)

# Generate train and validation datasets and data loaders.
train_dataset = TextDataset(max_block_size, tokenizer, 'train', train_fraction, filename = data_filename)
val_dataset = TextDataset(max_block_size, tokenizer, 'val', train_fraction, filename = data_filename)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle = True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle = True)

# Define the model architecture and optimizer.
model = GPT(num_decoder_blocks, tokenizer.vocabulary_length(), embedding_dimension, num_heads, head_dimension, max_block_size)
num_model_parameters = layer_utils.num_parameters(model, output_params_path)
print(f'Number of parameters in the model is {num_model_parameters["total_trainable_parameters"]}.')

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
evaluator = Evaluator()

# Perform model training and evaluation.
for (batch_index, train_batch) in enumerate(train_dataloader):
    if batch_index > num_batches_to_train:
        print('Reached maximum number of matches. Training is now complete.')
        torch.save(model.state_dict(), output_model_path)
        break

    train_features = train_batch['features']
    train_labels = train_batch['labels']
    predictions, loss = model(train_features, train_labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch_index % num_batches_between_evaluations == 0:
        (train_loss, val_loss) = evaluator.evaluate_train_and_validation_loss(train_dataloader, val_dataloader, model, num_batches_to_evaluate)
        generated_text = evaluator.generate_text(model, num_tokens_to_generate_during_evaluation, tokenizer)
        print(f' Batch index: {batch_index}, train loss: {train_loss}, val_loss: {val_loss}, generated text\n {generated_text}')


Number of parameters in the model is 509121.
 Batch index: 0, train loss: 4.119297027587891, val_loss: 4.1172614097595215, generated text
 
,:zisiOUOM
 Batch index: 10, train loss: 4.018176078796387, val_loss: 4.020906448364258, generated text
 
bHfi.eOQFx
 Batch index: 20, train loss: 3.9223999977111816, val_loss: 3.928990125656128, generated text
 

VakbdQh'l
 Batch index: 30, train loss: 3.8112730979919434, val_loss: 3.833385944366455, generated text
 
hu: pJXZBu
 Batch index: 40, train loss: 3.703054428100586, val_loss: 3.7184596061706543, generated text
 
&lwDnuIm.x
 Batch index: 50, train loss: 3.5785396099090576, val_loss: 3.5882728099823, generated text
 
 yeaHoi
O3
 Batch index: 60, train loss: 3.4777920246124268, val_loss: 3.496640682220459, generated text
 
ynEk sbaUa
 Batch index: 70, train loss: 3.3958580493927, val_loss: 3.4972081184387207, generated text
 
 VhenoI,
f
 Batch index: 80, train loss: 3.368961811065674, val_loss: 3.406102418899536, generated text
 
 $br  i&Lk

In [7]:
model_path = 'gpt.pt'
model = GPT(num_decoder_blocks, tokenizer.vocabulary_length(), embedding_dimension, num_heads, head_dimension, max_block_size)

model.load_state_dict(torch.load(model_path))

for (name, module) in model.named_modules():
    print(name, module)

 GPT(
  (token_embedding_layer): Embedding(65, 64)
  (positional_encoding_layer): Embedding(24, 64)
  (transformer_decoders): ModuleList(
    (0-9): 10 x TransformerDecoderBlock(
      (attention_layer): MultiHeadMaskedAttention(
        (Wq): Linear(in_features=64, out_features=128, bias=True)
        (Wk): Linear(in_features=64, out_features=128, bias=True)
        (Wv): Linear(in_features=64, out_features=128, bias=True)
        (head_merge_layer): Linear(in_features=128, out_features=64, bias=True)
      )
      (mlp_layer): MLP(
        (layer): Sequential(
          (0): Linear(in_features=64, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=64, bias=True)
        )
      )
      (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head_layer): Linear(in_features=64, out_features=65, bias=True)
)
token_embedding_layer Embedding(65, 64)
positional_encoding_layer Embedding(24, 64)
transformer_decoders 

In [4]:
model

GPT(
  (token_embedding_layer): Embedding(65, 64)
  (positional_encoding_layer): Embedding(24, 64)
  (transformer_decoders): ModuleList(
    (0-9): 10 x TransformerDecoderBlock(
      (attention_layer): MultiHeadMaskedAttention(
        (Wq): Linear(in_features=64, out_features=128, bias=True)
        (Wk): Linear(in_features=64, out_features=128, bias=True)
        (Wv): Linear(in_features=64, out_features=128, bias=True)
        (head_merge_layer): Linear(in_features=128, out_features=64, bias=True)
      )
      (mlp_layer): MLP(
        (layer): Sequential(
          (0): Linear(in_features=64, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=64, bias=True)
        )
      )
      (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head_layer): Linear(in_features=64, out_features=65, bias=True)
)

In [5]:
model.named_modules()

<generator object Module.named_modules at 0x7fcf4b85e6d0>

In [8]:
model.state_dict()

OrderedDict([('token_embedding_layer.weight',
              tensor([[ 0.3120, -0.1683, -0.3132,  ..., -0.8087,  0.1813,  0.2064],
                      [ 0.5178,  0.9949, -0.2497,  ...,  1.2679, -1.4586, -2.1651],
                      [-0.2390, -2.0628, -0.8271,  ...,  0.1260,  0.8500,  0.0474],
                      ...,
                      [-0.7930,  0.5108, -1.7304,  ...,  0.4013,  2.5323, -1.2270],
                      [ 0.3892,  0.0225, -0.6838,  ...,  0.2072, -0.0805, -0.7841],
                      [-1.2294, -0.6343, -0.1646,  ...,  0.1857, -0.7578,  0.5398]])),
             ('positional_encoding_layer.weight',
              tensor([[-1.0398, -0.6559,  0.9377,  ...,  0.1232, -1.2365, -0.1056],
                      [ 0.2216,  0.5529, -1.5980,  ..., -0.9717, -1.7982, -1.8113],
                      [-0.6465,  0.7719, -0.7789,  ..., -0.2921, -0.5079, -0.0046],
                      ...,
                      [-0.5420,  1.5233,  0.8172,  ...,  0.8380, -0.0911,  1.0546],
       