# DLRover's NanoGPT Training
This notebook provides an implementation of the NanoGPT model using DLRover's distributed training setup. The code is designed to run in both GPU and CPU environments.

## Setup and Dependencies
First, let's import the necessary libraries and set up the environment for distributed training.

In [3]:
import argparse
import math
import os
import pickle
import time
from datetime import timedelta
from collections import Counter
import wandb

import numpy as np
import torch
import torch.distributed as dist
from lora import apply_lora
from model import GPT, GPTConfig
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import requests
from contextlib import nullcontext

# Set the CUDA_DEVICE_ORDER environment variable for this process.
# NOTE(review): presumably so that CUDA device indices follow PCI bus order
# (i.e. match what nvidia-smi reports) — confirm this is the intent.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


## Get Dataset

In [5]:
# Download the tiny shakespeare dataset (only if it is not cached locally)
# and read the whole text into `data`.
input_file_path = os.path.join(os.getcwd(), 'data/input.txt')
# Make sure the target directory exists; exist_ok makes this safe to re-run.
directory = os.path.dirname(input_file_path)
os.makedirs(directory, exist_ok=True)

if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Fetch BEFORE opening the output file: opening first would leave an empty
    # input.txt behind when the request fails, and every later run would then
    # skip the download and silently train on an empty dataset.
    # The underscore prefix keeps this helper out of the `config` scan, which
    # collects public scalar globals.
    _response = requests.get(data_url, timeout=60)
    _response.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(_response.text)

# Explicit encoding so the read does not depend on the platform default.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# Vocabulary: every distinct character of the text, in sorted order.
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# Lookup tables between characters and their integer ids
# (the id of a character is its position in `chars`).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))
def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Each character is looked up in the module-level `stoi` table; a character
    missing from that table raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(l):
    """Decoder: take an iterable of integer ids, return the corresponding string.

    Each id is looked up in the module-level `itos` table; an id missing from
    that table raises KeyError.
    """
    return ''.join(itos[token_id] for token_id in l)

# Split the raw text: the first 90% of the characters for training,
# the remainder for validation.
n = len(data)
train_data, val_data = data[:int(n*0.9)], data[int(n*0.9):]

# Turn both splits into lists of integer ids and report their sizes.
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# Write each split to disk as a flat array of uint16 ids.
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.getcwd(), 'data/train.bin'))
val_ids.tofile(os.path.join(os.getcwd(), 'data/val.bin'))

# Persist the vocabulary size and both lookup tables so the text can be
# encoded/decoded again later.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)
with open(os.path.join(os.getcwd(), 'data/meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

# length of dataset in characters:  1115394
# all the unique characters:
#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens


length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens


## Configuration

In [6]:

# default config values designed to train a gpt2 on shakespeare
# Every public int/float/bool/str global defined in this cell is picked up by
# the `config` dictionary built in the following cell, so names and values
# here are what gets logged.
# I/O
out_dir = 'out'
eval_interval = 200
log_interval = 1
eval_iters = 20
eval_only = False # if True, script exits right after the first eval
always_save_checkpoint = True # if True, always save a checkpoint after each eval
# NOTE(review): the model-init cell in this part of the notebook only handles 'scratch'.
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log = True # enabled in this notebook; set to False to turn wandb logging off
wandb_project = 'shakespeare'
wandb_run_name = 'gpt2' # 'run' + str(time.time())
# data
dataset = 'shakespeare'
gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 64 # if you have a gpu then 1024 or higher
# model
n_layer = 4 # you can dial higher like 12 when using gpu
n_head = 2
n_embd = 128 # you can dial higher like 768 when using gpu
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate = 6e-4 # max learning rate
max_iters = 2000 # total number of training iterations, higher like 600000 when using gpu
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
# NOTE(review): warmup_iters equals max_iters (2000) and lr_decay_iters (600000)
# is far larger than max_iters, although the comment below says it should be
# ~= max_iters — confirm these values are intended for this small run.
decay_lr = True # whether to decay the learning rate
warmup_iters = 2000 # how many steps to warm up for
lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
# Only read when the run is distributed (RANK set); that branch also switches
# `device` to cuda:<local_rank>.
backend = 'nccl' # 'nccl', 'gloo', etc.
# system
# this shows how to configure on a cpu machine, if you have a gpu then set device = "cuda" and compile = True
device = 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
# Evaluates to 'float16' whenever CUDA is unavailable or lacks bf16 support.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
compile = False # use PyTorch 2.0 to compile the model to be faster
use_fsdp = False
use_lora = False


In [7]:
# Snapshot every public scalar global (int, float, bool or str) into `config`;
# this is what gets reported for logging. Names starting with '_' are skipped.
config = {
    name: value
    for name, value in globals().items()
    if not name.startswith('_') and isinstance(value, (int, float, bool, str))
}
config_keys = list(config)  # same names, in the same order as the dictionary
config

{'input_file_path': '/Users/ruiruizhang/Documents/dlrover/examples/pytorch/nanogpt/notebook/data/input.txt',
 'data_url': 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt',
 'directory': '/Users/ruiruizhang/Documents/dlrover/examples/pytorch/nanogpt/notebook/data',
 'vocab_size': 65,
 'n': 1115394,
 'out_dir': 'out',
 'eval_interval': 200,
 'log_interval': 1,
 'eval_iters': 20,
 'eval_only': False,
 'always_save_checkpoint': True,
 'init_from': 'scratch',
 'wandb_log': True,
 'wandb_project': 'shakespeare',
 'wandb_run_name': 'gpt2',
 'dataset': 'shakespeare',
 'gradient_accumulation_steps': 40,
 'batch_size': 12,
 'block_size': 64,
 'n_layer': 4,
 'n_head': 2,
 'n_embd': 128,
 'dropout': 0.0,
 'bias': False,
 'learning_rate': 0.0006,
 'max_iters': 2000,
 'weight_decay': 0.1,
 'beta1': 0.9,
 'beta2': 0.95,
 'grad_clip': 1.0,
 'decay_lr': True,
 'warmup_iters': 2000,
 'lr_decay_iters': 600000,
 'min_lr': 6e-05,
 'backend': 'nccl',
 'device': 'cp

In [8]:
# various inits, derived attributes, I/O setup
# The run is treated as distributed (DDP) when a RANK environment variable is set.
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    # Call through the `dist` alias (`import torch.distributed as dist`); the
    # bare name `init_process_group` is never imported in this notebook and
    # would raise NameError as soon as a distributed run is launched.
    dist.init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # Each process drives the GPU matching its local rank; this overrides the
    # `device` chosen in the configuration cell.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running as a single process on a single device
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
# Tokens consumed per optimizer step across all processes.
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# Only the master process creates the output directory.
if master_process:
    os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# No autocast on CPU (plain no-op context); otherwise autocast to `ptdtype`.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)


tokens per iteration will be: 30,720


## Data Loader
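The data loader reads the token files through `np.memmap`, so batches are sliced directly from disk instead of loading the whole dataset into memory, which is especially beneficial for large datasets.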

In [10]:
def _open_tokens(filename):
    """Memory-map a token file under the current working directory (read-only, uint16)."""
    return np.memmap(os.path.join(os.getcwd(), filename), dtype=np.uint16, mode='r')

# Token streams for both splits, as written by the dataset cell.
train_data = _open_tokens('data/train.bin')
val_data = _open_tokens('data/val.bin')
def get_batch(split):
    """Sample one random batch of (input, target) token windows.

    Args:
        split: 'train' reads from `train_data`; any other value reads from
            `val_data`.

    Returns:
        A tuple (x, y) of int64 tensors on `device`, each stacking
        `batch_size` windows of `block_size` tokens. `y` holds the same
        windows as `x` shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(offset):
        # One window per sampled start index, beginning `offset` tokens after it.
        return torch.stack([
            torch.from_numpy((source[s + offset:s + offset + block_size]).astype(np.int64))
            for s in starts
        ])

    x, y = windows(0), windows(1)
    if device_type != 'cuda':
        return x.to(device), y.to(device)
    # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
    return (
        x.pin_memory().to(device, non_blocking=True),
        y.pin_memory().to(device, non_blocking=True),
    )

# Training-state defaults; can be overridden if init_from='resume' (i.e. from a checkpoint).
iter_num, best_val_loss = 0, 1e9

# Try to read the vocabulary size from the dataset's meta file; it stays None
# when the file does not exist.
meta_path = os.path.join(os.getcwd(), 'data/meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
        meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")


found vocab_size = 65 (inside /Users/ruiruizhang/Documents/dlrover/examples/pytorch/nanogpt/notebook/data/meta.pkl)


## Model Initialization
This cell builds the GPT model from the configuration values above; the vocabulary size comes from the dataset's meta file when one was found.

In [11]:
# model init
# Constructor arguments for GPTConfig, taken from the configuration cell;
# vocab_size is filled in below.
model_args = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': block_size,
    'bias': bias,
    'vocab_size': None,
    'dropout': dropout,
}
if init_from == 'scratch':
    # build a brand-new model
    print("Initializing a new model from scratch")
    # prefer the vocab size found in the dataset's meta file, else fall back to 50304
    if meta_vocab_size is not None:
        model_args['vocab_size'] = meta_vocab_size
    else:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
        model_args['vocab_size'] = 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)

Initializing a new model from scratch
number of parameters: 0.80M
