In [None]:
# optimizer and loss

def calc_loss_batch(input_batch, target_batch, model, device) -> "torch.Tensor":
    """Compute the cross-entropy loss of ``model`` on one batch.

    NOTE(review): this function is defined again further down in this cell;
    the later definition is the one that stays bound after the cell runs.

    Args:
        input_batch: Tensor passed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target class indices; moved to ``device`` and
            flattened to 1-D before the loss is computed.
        model: Callable returning logits whose first two dimensions are merged
            before the loss (presumably (batch, seq, vocab) -- TODO confirm).
        device: Destination handed to ``Tensor.to``.

    Returns:
        The scalar loss as a tensor (not a Python float), so that callers can
        run ``.backward()`` on it.
    """
    input_batch = input_batch.to(device, non_blocking=True)
    target_batch = target_batch.to(device, non_blocking=True)
    logits = model(input_batch)
    # Merge the two leading logit dimensions so each position is one sample.
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

import math

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    On every step the last ``context_size`` columns of the running sequence
    are passed to ``model``, the scores at the final position are turned into
    probabilities, and the most probable id is appended.

    Args:
        model: Callable whose output is indexed as ``[:, position, :]``.
        idx: 2-D tensor of token ids; new ids are concatenated along dim 1.
        max_new_tokens: Number of ids to append.
        context_size: Maximum number of trailing ids shown to ``model``.

    Returns:
        The input sequence with ``max_new_tokens`` additional columns.
    """
    sequence = idx
    for _step in range(max_new_tokens):
        window = sequence[:, -context_size:]
        # Only the forward pass runs with autograd disabled.
        with torch.no_grad():
            scores = model(window)
        last_position_probs = torch.softmax(scores[:, -1, :], dim=-1)
        next_token = last_position_probs.argmax(dim=-1, keepdim=True)
        sequence = torch.cat((sequence, next_token), dim=1)
    return sequence


END_OF_TEXT = '<|endoftext|>'
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a batch containing one sequence.

    Args:
        text: String to encode.
        tokenizer: Object with ``encode(text, allowed_special=...)`` returning
            a sequence of integer ids. ``END_OF_TEXT`` is passed as the only
            allowed special token.

    Returns:
        An int64 tensor of shape (1, T).
    """
    encoded = tokenizer.encode(text, allowed_special={END_OF_TEXT})
    # Pin the dtype: for an empty id list torch.tensor() would otherwise infer
    # float32, which is not a usable token-id dtype.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # (T) -> (1, T)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a one-sequence batch of token ids into a string.

    Args:
        token_ids: Tensor of ids; a leading dimension of size 1 is squeezed
            away before decoding.
        tokenizer: Object with ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the id list.
    """
    # (1, T) -> (T), then to a plain Python list for the tokenizer.
    return tokenizer.decode(token_ids.squeeze(0).tolist())


def calc_loss_batch(input_batch, target_batch, model, device) -> "torch.Tensor":
    """Return the cross-entropy loss of ``model`` for a single batch.

    Both tensors are moved to ``device``, the model is run on the inputs, and
    the logits/targets are flattened so every position counts as one sample.

    Args:
        input_batch: Tensor given to ``model``.
        target_batch: Tensor of target class indices, same leading layout as
            the logits (presumably (batch, seq) -- TODO confirm).
        model: Callable producing logits with at least three dimensions.
        device: Destination handed to ``Tensor.to``.

    Returns:
        A scalar loss tensor. It is a tensor rather than a float because the
        training loop calls ``.backward()`` on the result.
    """
    inputs = input_batch.to(device, non_blocking=True)
    targets = target_batch.to(device, non_blocking=True)
    logits = model(inputs)
    return torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

def calc_loss(loader, model, device, num_batches=None) -> float:
    """Average ``calc_loss_batch`` over the first batches of ``loader``.

    Args:
        loader: Iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate; ``None`` (the
            default) evaluates every batch the loader yields.

    Returns:
        The mean per-batch loss as a Python float, or ``nan`` when no batch
        was evaluated (empty loader or ``num_batches == 0``).
    """
    from itertools import islice

    total_loss = 0.0
    processed_batches = 0

    # islice treats None as "no limit" and, unlike enumerate + break, does not
    # pull one extra batch from the loader just to discard it.
    for input_batch, target_batch in islice(loader, num_batches):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        # .item() keeps the accumulator a plain float, matching the annotation.
        total_loss += loss.item()
        processed_batches += 1

    if processed_batches == 0:
        return float("nan")
    return total_loss / processed_batches

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure the loss on the training and validation loaders.

    The model is switched to eval mode, ``calc_loss`` is run on each loader
    with autograd disabled (limited to ``eval_iter`` batches), and the model
    is switched back to train mode before returning.

    Returns:
        A ``(train_loss, val_loss)`` tuple.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation -- same order as the result.
        losses = tuple(
            calc_loss(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

def print_sample(model, tokenizer, device, start_context, max_new_tokens=20):
    """Generate a short continuation of ``start_context`` and print it.

    The model is put in eval mode for generation and back in train mode
    afterwards. The context window is read from the first dimension of
    ``model.position_embeddings.weight``.

    Args:
        model: Model exposing ``position_embeddings.weight``.
        tokenizer: Tokenizer passed to ``text_to_token_ids`` /
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt string.
        max_new_tokens: Number of tokens to generate (default 20, the value
            that used to be hard-coded).
    """
    model.eval()
    context_size = model.position_embeddings.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model=model, idx=encoded,
                                         max_new_tokens=max_new_tokens,
                                         context_size=context_size)

    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print("decoded text: [" + decoded_text +"]\n")
    model.train()

def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer,
                start_epoch=0, initial_global_step=-1, initial_tokens_seen=0, initial_best_val_loss=math.inf,
                initial_train_losses=None, initial_val_losses=None, initial_track_tokens_seen=None,
                checkpoint_path="latest_checkpoint.pth", best_model_path="best_model_params.pth"):
    """Train ``model`` with periodic evaluation, sampling and checkpointing.

    Args:
        model, optimizer: Model and optimizer to train.
        train_loader, val_loader: Iterables of ``(input_batch, target_batch)``.
        device: Device batches are moved to.
        num_epochs: Epoch index at which training stops (exclusive). Together
            with ``start_epoch`` the loop covers ``range(start_epoch,
            num_epochs)``, so a resumed run continues instead of restarting.
        eval_freq: Evaluate every time ``global_step % eval_freq == 0``.
        eval_iter: Batch limit forwarded to ``evaluate_model``.
        start_context, tokenizer: Prompt and tokenizer for ``print_sample``.
        start_epoch: First epoch index to run (0 for a fresh run).
        initial_*: Counters / histories restored from a checkpoint. The
            history lists, when given, are appended to in place.
        checkpoint_path: File handed to ``save_checkpoint``.
        best_model_path: File the best ``state_dict`` is saved to.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``.
    """
    # Initialize from loaded/default states
    train_losses = initial_train_losses if initial_train_losses is not None else []
    val_losses = initial_val_losses if initial_val_losses is not None else []
    track_tokens_seen = initial_track_tokens_seen if initial_track_tokens_seen is not None else []
    tokens_seen = initial_tokens_seen
    global_step = initial_global_step
    best_val_loss = initial_best_val_loss

    # Honor start_epoch so resuming does not replay already-finished epochs.
    for epoch in range(start_epoch, num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            print(".", end="")

            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Common max_norm value
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1
            # xm.mark_step() # xla / tpu

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"epoch {epoch+1} step {global_step:06d}: train loss {train_loss:0.3f}, val loss: {val_loss:0.3f}")
                if val_loss < best_val_loss:
                    # Use the caller-supplied path rather than a notebook global.
                    torch.save(model.state_dict(), best_model_path)
                    best_val_loss = val_loss
                print_sample(model, tokenizer, device, start_context)
                save_checkpoint(epoch, global_step, model, optimizer, tokens_seen, best_val_loss,
                                train_losses, val_losses, track_tokens_seen, checkpoint_path)

    return train_losses, val_losses, track_tokens_seen

import torch._dynamo
import os

#os.environ["TORCHDYNAMO_VERBOSE"] = "1"
#os.environ["TORCH_LOGS"] = "+dynamo,inductor" # Get logs from both

# Override the batch size for this run (mutates the shared config mapping).
config['batch_size']=28
print(f"batch size: {config['batch_size']}")


print("Creating dataloaders ... ", end="")
train_loader = create_dataloader(train_dataset, tokenizer=enc,
                                    batch_size=config['batch_size'],
                                    max_length=config['context_length'],
                                    stride=config['context_length'])

val_loader = create_dataloader(val_dataset, tokenizer=enc,
                                    batch_size=config['batch_size'],
                                    max_length=config['context_length'],
                                    stride=config['context_length'])

print(f"Using device: {device}")
model = GPTModel(config)
model.to(device)
if device == 'cuda':
  model.to(torch.bfloat16)

model = torch.compile(model) #, backend='openxla')

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

CHECKPOINT_PATH = '/content/drive/MyDrive/colab/llm_e2e/training_checkpoint.pkl'
BEST_MODEL_PATH = '/content/drive/MyDrive/colab/llm_e2e/parameters.pth'

# Fresh-start training state. The values mirror train_model's own defaults.
# Defining it here keeps the cell runnable on a fresh kernel: the checkpoint
# load below is disabled, so nothing else in this cell assigns loaded_states.
loaded_states = {
    'start_epoch': 0,
    'global_step': -1,
    'tokens_seen': 0,
    'best_val_loss': math.inf,
    'train_losses': [],
    'val_losses': [],
    'track_tokens_seen': [],
}
# To resume from a checkpoint instead, re-enable:
#loaded_states = load_checkpoint(CHECKPOINT_PATH, model, optimizer, device)

num_epochs=50


total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total number of parameters: {total_params:,}')

# --- Start Training ---
# Pass the loaded states to train_model
# NOTE(review): train_loader is built above, but training runs on
# overfit_loader, which is not defined in this cell -- confirm it is created
# by an earlier cell or switch to train_loader.
train_losses_log, val_losses_log, tokens_seen_log = train_model(
    model, overfit_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=1, eval_iter=5,
    start_context="the fastest way to", tokenizer=enc,
    start_epoch=loaded_states['start_epoch'],
    initial_global_step=loaded_states['global_step'],
    initial_tokens_seen=loaded_states['tokens_seen'],
    initial_best_val_loss=loaded_states['best_val_loss'],
    initial_train_losses=loaded_states['train_losses'],
    initial_val_losses=loaded_states['val_losses'],
    initial_track_tokens_seen=loaded_states['track_tokens_seen'],
    checkpoint_path=CHECKPOINT_PATH,
    best_model_path=BEST_MODEL_PATH
)

print("Training finished.")





In [None]:
%run -n 00_config.ipynb
%run -n 01_data_pipeline.ipynb
%run -n 02_gpt2_model.ipynb

import torch
import tiktoken

@torch.no_grad()
def estimate_loss(model, loader, eval_iters):
    """Average the model's loss over up to ``eval_iters`` batches.

    The model is switched to eval mode for the measurement and back to train
    mode afterwards; the whole function runs with autograd disabled.

    Args:
        model: Callable such that ``model(X, Y)`` returns ``(logits, loss)``.
        loader: Iterable yielding ``(X, Y)`` batches.
        eval_iters: Maximum number of batches to consume.

    Returns:
        A scalar tensor holding the mean loss over the batches actually seen
        (``nan`` if there were none).
    """
    from itertools import islice

    model.eval()
    # Collect only the batches that exist: a pre-sized zero buffer would drag
    # the mean toward 0 whenever the loader yields fewer than eval_iters.
    losses = [model(X, Y)[1].item() for X, Y in islice(loader, eval_iters)]
    model.train()
    return torch.tensor(losses).mean()

def evaluate_model(model, train_loader, val_loader, eval_iters):
    """Estimate the loss on the training and validation loaders.

    Args:
        model: Model forwarded to ``estimate_loss``.
        train_loader, val_loader: Batch iterables to evaluate.
        eval_iters: Maximum number of batches used per loader.

    Returns:
        A ``(train_loss, val_loss)`` tuple of ``estimate_loss`` results.
    """
    # estimate_loss is the helper whose (model, loader, eval_iters) signature
    # matches this call; the parameter is ``eval_iters``, not ``eval_iter``.
    train_loss = estimate_loss(model, train_loader, eval_iters)
    val_loss = estimate_loss(model, val_loader, eval_iters)
    return train_loss, val_loss
    
def train_model(model, train_loader, val_loader, optimizer, cfg):
    """Run a basic training loop and print the running training loss.

    Args:
        model: Callable such that ``model(X, Y)`` returns ``(logits, loss)``.
        train_loader: Iterable yielding ``(X, Y)`` batches.
        val_loader: Accepted for interface compatibility; not used here.
        optimizer: Optimizer over the model's parameters.
        cfg: Object exposing ``num_epochs`` and ``eval_interval``.
    """
    for epoch in range(cfg.num_epochs):
        running_loss = 0.0
        for i, (X, Y) in enumerate(train_loader):

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            # The loss must come from a gradient-enabled forward pass on the
            # current batch; estimate_loss runs under no_grad and cannot be
            # backpropagated.
            _logits, loss = model(X, Y)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            # Report after every full window of eval_interval batches so the
            # divisor matches the number of losses accumulated.
            if (i + 1) % cfg.eval_interval == 0:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / cfg.eval_interval:.3f}')
                running_loss = 0.0

# Build the run configuration from YAML. GPT2Config is not defined in this
# cell (presumably provided by one of the %run notebooks above -- confirm).
cfg = GPT2Config.from_yaml("gpt2_config.yaml")
# tiktoken's 'gpt2' encoding; also inspected by a later cell (gpt2.n_vocab).
gpt2 = tiktoken.get_encoding('gpt2')
# Batch tokenizer: takes a record whose 'text' entry is passed to encode_batch
# and returns the encoded ids under the 'tokens' key.
tokenizer = lambda r: {'tokens': gpt2.encode_batch(r['text'], allowed_special={"<|endoftext|>"})} # endoftext may separate documents

# Train loader uses the dataloader's default split; the validation loader
# explicitly requests split='test'.
train_loader = ShakespeareDataloader(batch_size=cfg.batch_size, sequence_length=cfg.context_length, tokenizer=tokenizer)
val_loader = ShakespeareDataloader(batch_size=cfg.batch_size, sequence_length=cfg.context_length, tokenizer=tokenizer, split='test')

# Model and optimizer hyperparameters all come from cfg.
model = GPTModel(cfg)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
train_model(model, train_loader, val_loader, optimizer, cfg=cfg)

print('Finished Training')


[2mResolved [1m145 packages[0m [2min 1ms[0m[0m
[2mAudited [1m140 packages[0m [2min 0.08ms[0m[0m
ShakespeareDataloader Initializing: karpathy/tiny_shakespeare with B=5, T=1024, split='train'
ShakespeareDataloader Pre-tokenizing text data n=1,003,854 for split 'train'... estimated batches: 58
ShakespeareDataloader iterator reset for split 'train', starting at token 0
Total tokens analyzed: 102,400
Unique tokens: 7013
Top 10 tokens:
  ID 198   ('\n'      ): 12,382 (0.1209)
  ID 11    (','       ): 5,909  (0.0577)
  ID 25    (':'       ): 3,139  (0.0307)
  ID 13    ('.'       ): 2,362  (0.0231)
  ID 262   (' the'    ): 1,753  (0.0171)
  ID 284   (' to'     ): 1,298  (0.0127)
  ID 286   (' of'     ): 1,090  (0.0106)
  ID 290   (' and'    ): 1,083  (0.0106)
  ID 26    (';'       ): 1,003  (0.0098)
  ID 314   (' I'      ): 997    (0.0097)
Vocabulary coverage: 0.13954275026364488
ShakespeareDataloader iterator reset for split 'train', starting at token 0
x: [5962, 22307, 25, 198, 8

In [12]:
# Vocabulary size reported by the tiktoken 'gpt2' encoding.
gpt2.n_vocab

50257