In [None]:
# optimizer and loss

def calc_loss_batch(input_batch, target_batch, model, device) -> "torch.Tensor":
    """Compute the cross-entropy loss of ``model`` on a single batch.

    NOTE(review): this cell defines ``calc_loss_batch`` a second time further
    down; the later definition is the one that stays bound after the cell runs.

    Args:
        input_batch: Tensor fed to ``model``; moved to ``device`` first.
        target_batch: Tensor of target class indices; moved to ``device`` and
            flattened to 1-D before the loss is computed.
        model: Callable returning logits. The first two logit dimensions are
            merged with ``flatten(0, 1)`` (presumably ``(batch, seq, vocab)``
            -- confirm against the model definition).
        device: Device both tensors are moved to.

    Returns:
        The loss as a ``torch.Tensor`` (not a Python float), so the caller can
        call ``.backward()`` on it.
    """
    # non_blocking=True is forwarded as-is to Tensor.to for both transfers.
    inputs = input_batch.to(device, non_blocking=True)
    targets = target_batch.to(device, non_blocking=True)

    logits = model(inputs)
    # cross_entropy wants (N, C) logits against (N,) targets, so collapse the
    # two leading logit dimensions and flatten the targets to match.
    return torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

import math

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Each step feeds the last ``context_size`` columns of ``idx`` to ``model``,
    takes the logits at the final position, and appends the arg-max token.

    Args:
        model: Callable returning logits indexed as ``[:, position, :]``.
        idx: 2-D tensor of token ids; new tokens are concatenated along dim 1.
        max_new_tokens: Number of decoding steps to run.
        context_size: Maximum number of trailing tokens passed to ``model``.

    Returns:
        ``idx`` with ``max_new_tokens`` extra columns appended.
    """
    for _step in range(max_new_tokens):
        # Crop to the trailing window the model is allowed to see.
        window = idx[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the final position is used for the next token.
        last_logits = all_logits[:, -1, :]
        next_probs = torch.softmax(last_logits, dim=-1)
        next_token = torch.argmax(next_probs, dim=-1, keepdim=True)

        idx = torch.cat((idx, next_token), dim=1)
    return idx


END_OF_TEXT = '<|endoftext|>'
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a single-row integer tensor.

    Args:
        text: String to encode.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a list of integer token ids. ``END_OF_TEXT`` is passed as the only
            allowed special token.

    Returns:
        ``torch.long`` tensor of shape ``(1, T)`` where ``T`` is the number of
        ids returned by the tokenizer (``T`` may be 0 for empty text).
    """
    encoded = tokenizer.encode(text, allowed_special={END_OF_TEXT})
    # Pin the dtype: torch.tensor([]) would otherwise infer float32 for an
    # empty encoding, which is not a valid index dtype downstream.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # (T) -> (B, T)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids; a leading dimension of size 1 is
            squeezed away before decoding.
        tokenizer: Object whose ``decode`` accepts the ids as a Python list.

    Returns:
        Whatever ``tokenizer.decode`` returns for the id list.
    """
    # (B, T) -> (T), then hand the tokenizer a plain Python list.
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)


def calc_loss_batch(input_batch, target_batch, model, device) -> "torch.Tensor":
    """Return the cross-entropy loss of ``model`` for one (input, target) batch.

    NOTE(review): a function with the same name and signature is also defined
    earlier in this cell; this later definition overrides it.

    Args:
        input_batch: Tensor passed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target class indices; moved to ``device`` and
            flattened before the loss is computed.
        model: Callable producing logits whose first two dimensions are merged
            via ``flatten(0, 1)`` (presumably ``(batch, seq, vocab)`` -- confirm).
        device: Destination device for both tensors.

    Returns:
        The loss as a ``torch.Tensor`` rather than a Python float, so callers
        can backpropagate through it.
    """
    input_batch = input_batch.to(device, non_blocking=True)
    target_batch = target_batch.to(device, non_blocking=True)

    # Merge the two leading logit dimensions so each row lines up with one
    # entry of the flattened targets.
    flat_logits = model(input_batch).flatten(0, 1)
    flat_targets = target_batch.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

def calc_loss(loader, model, device, num_batches=None) -> float:
    """Average ``calc_loss_batch`` over the first batches of ``loader``.

    Args:
        loader: Iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` (the
            default) evaluates every batch the loader yields; values <= 0
            evaluate nothing.

    Returns:
        The mean per-batch loss as a Python float, or ``nan`` when no batch
        was evaluated (empty loader or ``num_batches`` <= 0).
    """
    from itertools import islice  # local import keeps this definition self-contained

    # islice(..., None) iterates everything, so the default no longer trips
    # over an ``int >= None`` comparison; it also avoids fetching one extra
    # batch just to discover the limit was reached.
    limit = None if num_batches is None else max(int(num_batches), 0)

    total_loss = None
    processed_batches = 0
    for input_batch, target_batch in islice(loader, limit):
        # detach(): only the value is needed, so never keep a graph alive
        # even if the caller forgot to disable gradients.
        loss = calc_loss_batch(input_batch, target_batch, model, device).detach()
        total_loss = loss if total_loss is None else total_loss + loss
        processed_batches += 1

    if processed_batches == 0:
        return float("nan")

    # Convert once at the end so there is a single host sync, and so callers
    # get the plain float promised by the signature.
    return (total_loss / processed_batches).item()

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure the loss on the training and validation loaders.

    The model is put into eval mode for the measurement and put back into
    train mode before returning.

    Args:
        model: Model to evaluate.
        train_loader: Loader passed to ``calc_loss`` for the training loss.
        val_loader: Loader passed to ``calc_loss`` for the validation loss.
        device: Device forwarded to ``calc_loss``.
        eval_iter: Forwarded to ``calc_loss`` as ``num_batches``.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # Same measurement for both splits, training loader first.
        train_loss, val_loss = [
            calc_loss(split_loader, model, device, num_batches=eval_iter)
            for split_loader in (train_loader, val_loader)
        ]

    model.train()
    return train_loss, val_loss

def print_sample(model, tokenizer, device, start_context):
    """Generate 20 tokens from ``start_context`` and print the decoded text.

    The model is switched to eval mode for generation and back to train mode
    afterwards.

    Args:
        model: Model exposing ``position_embeddings.weight``; the size of that
            weight's first dimension is used as the context size.
        tokenizer: Tokenizer handed to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text to continue.
    """
    model.eval()

    max_context = model.position_embeddings.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        generated_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=20,
            context_size=max_context,
        )

    sample_text = token_ids_to_text(generated_ids, tokenizer)
    print("decoded text: [" + sample_text + "]\n")

    model.train()

def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer,
                start_epoch=0, initial_global_step=-1, initial_tokens_seen=0, initial_best_val_loss=math.inf,
                initial_train_losses=None, initial_val_losses=None, initial_track_tokens_seen=None,
                checkpoint_path="latest_checkpoint.pth", best_model_path="best_model_params.pth"):
    """Train ``model``, periodically evaluating, sampling and checkpointing.

    Args:
        model: Model to train.
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader forwarded to ``evaluate_model``.
        optimizer: Optimizer stepped once per training batch.
        device: Device forwarded to the loss/evaluation/sampling helpers.
        num_epochs: Epoch index at which training stops (exclusive).
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``evaluate_model``.
        start_context: Prompt used by ``print_sample``.
        tokenizer: Tokenizer used by ``print_sample``.
        start_epoch: First epoch index to run; epochs before it are skipped
            so a resumed run continues where it left off.
        initial_global_step: Step counter to resume from (``-1`` so that the
            first batch becomes step 0).
        initial_tokens_seen: Token counter to resume from.
        initial_best_val_loss: Best validation loss seen so far.
        initial_train_losses: Existing training-loss history, or ``None``.
        initial_val_losses: Existing validation-loss history, or ``None``.
        initial_track_tokens_seen: Existing tokens-seen history, or ``None``.
        checkpoint_path: Path passed to ``save_checkpoint``.
        best_model_path: Path the best ``state_dict`` is saved to.

    Returns:
        Tuple ``(train_losses, val_losses, track_tokens_seen)``. When history
        lists were passed in, the same list objects are extended and returned.
    """
    # Initialize from loaded/default states
    train_losses = initial_train_losses if initial_train_losses is not None else []
    val_losses = initial_val_losses if initial_val_losses is not None else []
    track_tokens_seen = initial_track_tokens_seen if initial_track_tokens_seen is not None else []
    tokens_seen = initial_tokens_seen
    global_step = initial_global_step
    best_val_loss = initial_best_val_loss

    # Honour start_epoch so resuming does not repeat already-completed epochs.
    for epoch in range(start_epoch, num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            print(".", end="")

            optimizer.zero_grad()
            # calc_loss_batch moves both tensors to `device` itself, so no
            # separate transfer is needed here.
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Common max_norm value
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1
            # xm.mark_step() # xla / tpu

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"epoch {epoch+1} step {global_step:06d}: train loss {train_loss:0.3f}, val loss: {val_loss:0.3f}")
                if val_loss < best_val_loss:
                    # Use the path argument rather than a notebook-level global.
                    torch.save(model.state_dict(), best_model_path)
                    best_val_loss = val_loss
                print_sample(model, tokenizer, device, start_context)
                save_checkpoint(epoch, global_step, model, optimizer, tokens_seen, best_val_loss,
                                train_losses, val_losses, track_tokens_seen, checkpoint_path)

    return train_losses, val_losses, track_tokens_seen

import torch._dynamo
import os

#os.environ["TORCHDYNAMO_VERBOSE"] = "1"
#os.environ["TORCH_LOGS"] = "+dynamo,inductor" # Get logs from both
# --- Run configuration ---
# Overrides the batch size in the shared `config` dict for this run.
config['batch_size']=28
print(f"batch size: {config['batch_size']}")


# --- Data ---
# stride == max_length for both loaders.
print("Creating dataloaders ... ", end="")
train_loader = create_dataloader(train_dataset, tokenizer=enc,
                                    batch_size=config['batch_size'],
                                    max_length=config['context_length'],
                                    stride=config['context_length'])

val_loader = create_dataloader(val_dataset, tokenizer=enc,
                                    batch_size=config['batch_size'],
                                    max_length=config['context_length'],
                                    stride=config['context_length'])

# --- Model and optimizer ---
print(f"Using device: {device}")
model = GPTModel(config)
model.to(device)
# NOTE(review): this branch only fires when `device` compares equal to the
# string 'cuda' -- confirm `device` is a str and not a torch.device here.
if device == 'cuda':
  model.to(torch.bfloat16)

model = torch.compile(model) #, backend='openxla')

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

CHECKPOINT_PATH = '/content/drive/MyDrive/colab/llm_e2e/training_checkpoint.pkl'
BEST_MODEL_PATH = '/content/drive/MyDrive/colab/llm_e2e/parameters.pth'

# --- Resume state ---
# Fresh-run state, mirroring train_model's own defaults. Without this the
# cell reads `loaded_states` below while its only assignment is commented
# out, which is a NameError on a fresh kernel.
loaded_states = {
    'start_epoch': 0,
    'global_step': -1,
    'tokens_seen': 0,
    'best_val_loss': math.inf,
    'train_losses': [],
    'val_losses': [],
    'track_tokens_seen': [],
}
# To resume from a checkpoint instead, uncomment the line below so it replaces
# the defaults above (presumably load_checkpoint returns a dict with these
# same keys -- TODO confirm against its definition).
#loaded_states = load_checkpoint(CHECKPOINT_PATH, model, optimizer, device)

num_epochs=50


total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total number of parameters: {total_params:,}')

# --- Start Training ---
# Pass the loaded states to train_model
# NOTE(review): training uses `overfit_loader`, which is not defined in this
# cell, while the `train_loader` built above is unused -- confirm this is the
# intended loader.
train_losses_log, val_losses_log, tokens_seen_log = train_model(
    model, overfit_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=1, eval_iter=5,
    start_context="the fastest way to", tokenizer=enc,
    start_epoch=loaded_states['start_epoch'],
    initial_global_step=loaded_states['global_step'],
    initial_tokens_seen=loaded_states['tokens_seen'],
    initial_best_val_loss=loaded_states['best_val_loss'],
    initial_train_losses=loaded_states['train_losses'],
    initial_val_losses=loaded_states['val_losses'],
    initial_track_tokens_seen=loaded_states['track_tokens_seen'],
    checkpoint_path=CHECKPOINT_PATH,
    best_model_path=BEST_MODEL_PATH
)

print("Training finished.")





In [1]:
#!pip install --upgrade tiktoken datasets fsspec
import torch
import tiktoken
import itertools

def estimate_loss(model, loader, device, eval_iters):
    """Average the model's loss over at most ``eval_iters`` batches.

    The model is put in eval mode for the measurement and returned to train
    mode afterwards, even if a batch raises.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``.
        loader: Iterable yielding ``(X, Y)`` batches.
        device: Device the batches are moved to.
        eval_iters: Maximum number of batches to pull from ``loader``.

    Returns:
        0-dim tensor holding the mean loss over the batches actually
        evaluated (``nan`` if there were none).
    """
    model.eval()
    batch_losses = []
    try:
        # Gradients are never needed here, regardless of how the caller
        # invokes this function.
        with torch.no_grad():
            for X, Y in itertools.islice(loader, eval_iters):
                X, Y = X.to(device), Y.to(device)
                _, loss = model(X, Y)
                batch_losses.append(loss.item())
    finally:
        model.train()

    # Average only over batches that were really seen; a buffer pre-sized to
    # eval_iters would drag the mean towards zero when the loader is shorter.
    return torch.tensor(batch_losses).mean()

# Called form of the decorator: the bare `@torch.no_grad` spelling is only
# accepted by newer PyTorch releases, `@torch.no_grad()` works everywhere.
@torch.no_grad()
def evaluate_model(model, train_loader, val_loader, device, eval_iters):
    """Estimate the training and validation loss with gradients disabled.

    Args:
        model: Model to evaluate.
        train_loader: Training dataset iterator.
        val_loader: Validation dataset iterator.
        device: Device forwarded to ``estimate_loss``.
        eval_iters: The number of iterations to pull from each loader.

    Returns:
        dict with 'train' and 'val' loss, each as returned by ``estimate_loss``.
    """
    # Dict entries are evaluated in order: training split first, then validation.
    return {
        'train': estimate_loss(model, train_loader, device, eval_iters),
        'val': estimate_loss(model, val_loader, device, eval_iters),
    }
    
def train_model(model, train_loader, val_loader, optimizer, cfg):
    """Run the training loop, logging a running loss and periodic evaluations.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``.
        train_loader: Iterable yielding ``(X, Y)`` training batches.
        val_loader: Validation loader forwarded to ``evaluate_model``.
        optimizer: Optimizer stepped once per batch.
        cfg: Config object providing ``device``, ``num_epochs``,
            ``log_iterval``, ``eval_interval`` and ``eval_iters``.
            NOTE(review): the attribute is spelled ``log_iterval`` here; it
            has to match the config class -- confirm it is not a typo for
            ``log_interval``.
    """
    device = torch.device(cfg.device)

    for epoch in range(cfg.num_epochs):
        model.train()
        running_loss = 0.0

        for i, (X, Y) in enumerate(train_loader):
            X, Y = X.to(device), Y.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            logits, loss = model(X, Y)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            step = i + 1
            if step % cfg.log_iterval == 0:
                print(f"[{epoch + 1}  {step:5d}]: running loss {running_loss / cfg.log_iterval:.3f}")
                running_loss = 0.0

            if step % cfg.eval_interval == 0:
                # NOTE(review): this iterates train_loader while the outer
                # loop is also iterating it -- confirm the loader tolerates
                # nested iteration without losing its position.
                losses = evaluate_model(model, train_loader, val_loader, device, eval_iters=cfg.eval_iters)
                print(f"[{epoch + 1}  {step:5d}]: train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}, eval_iters: {cfg.eval_iters}")
                # running_loss is deliberately NOT reset here: it is averaged
                # over log_iterval batches, so clearing it on the evaluation
                # schedule would under-report the next running loss whenever
                # the two intervals are not aligned.

# --- Configuration and tokenizer ---
cfg = GPT2Config() #.from_yaml("gpt2_config.yaml")
gpt2 = tiktoken.get_encoding('gpt2')


def tokenizer(r):
    """Batch-encode ``r['text']`` with the GPT-2 encoding.

    Returns a dict with a single ``'tokens'`` entry holding the encoded batch.
    """
    # endoftext may separate documents
    return {'tokens': gpt2.encode_batch(r['text'], allowed_special={"<|endoftext|>"})}


# --- Data: same settings for both loaders, only the split differs ---
train_loader = ShakespeareDataloader(
    batch_size=cfg.batch_size,
    sequence_length=cfg.context_length,
    tokenizer=tokenizer,
)
val_loader = ShakespeareDataloader(
    batch_size=cfg.batch_size,
    sequence_length=cfg.context_length,
    tokenizer=tokenizer,
    split='test',
)

# --- Model and optimizer ---
model = GPTModel(cfg)
model.to(cfg.device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay,
)

# --- Train ---
train_model(model, train_loader, val_loader, optimizer, cfg=cfg)

print('Finished Training')

[2mResolved [1m145 packages[0m [2min 5ms[0m[0m
[2mAudited [1m140 packages[0m [2min 0.04ms[0m[0m


  from .autonotebook import tqdm as notebook_tqdm


[2mResolved [1m145 packages[0m [2min 0.64ms[0m[0m
[2mAudited [1m140 packages[0m [2min 0.02ms[0m[0m
ShakespeareDataloader Initializing: karpathy/tiny_shakespeare with B=2, T=1024, split='train'
ShakespeareDataloader Pre-tokenizing text data n=1,003,854 for split 'train'... estimated batches: 147
ShakespeareDataloader iterator reset for split 'train', starting at token 0
[1,     2] loss: 10.401
[1,     4] loss: 9.000
[1,     6] loss: 8.250
[1,     8] loss: 7.472
[1,    10] loss: 6.812
[1,    12] loss: 6.420
[1,    14] loss: 6.330
[1,    16] loss: 6.334
[1,    18] loss: 6.507
[1,    20] loss: 6.340
[1,    22] loss: 6.652
[1,    24] loss: 7.109
[1,    26] loss: 7.529
[1,    28] loss: 7.452
[1,    30] loss: 8.287
[1,    32] loss: 6.981
[1,    34] loss: 6.785
[1,    36] loss: 6.492
[1,    38] loss: 6.590
[1,    40] loss: 6.584
[1,    42] loss: 6.820
[1,    44] loss: 6.569
[1,    46] loss: 6.606
[1,    48] loss: 6.715
[1,    50] loss: 6.712
[1,    52] loss: 6.632
[1,    54] loss: 

In [24]:
import torch
# Assumes `tokenizer`, `model`, and `gpt2` (tiktoken encoding) were defined by
# earlier cells. The model may live on any device.

test = "First Citizen"

# Tokenize
# The tokenizer is batch-oriented, so the prompt is wrapped in a list and the
# result is a list of id lists, e.g. [[id1, id2, ...]].
tokenized_output = tokenizer({'text': [test]})
tkns_list_of_lists = tokenized_output['tokens']

# Convert to tensor on the same device as the model's parameters, so the
# forward pass also works when the model was moved off the CPU.
input_ids = torch.tensor(tkns_list_of_lists, dtype=torch.long).to(next(model.parameters()).device)

# Model inference
# The training helpers leave the model in train mode; switch to eval mode so
# any layers that behave differently during training (e.g. dropout, if the
# model has any) do not make the prediction non-deterministic.
model.eval()
with torch.no_grad(): # Disable gradient calculations for inference
    # The training loop unpacks the model output as (logits, loss), so
    # element 0 is taken to be the logits -- presumably shaped
    # (batch_size, sequence_length, vocab_size); TODO confirm.
    logits = model(input_ids)[0]

# Convert logits to predicted token IDs
# argmax over the last dimension yields one predicted id per input position.
predicted_token_ids = torch.argmax(logits, dim=-1)

# Decode the predicted token IDs
# gpt2.decode_batch expects a list of lists of integers.
decoded_texts = gpt2.decode_batch(predicted_token_ids.tolist())

print(f"Input text: '{test}'")
print(f"Original token IDs: {tkns_list_of_lists}")
print(f"Predicted token IDs: {predicted_token_ids.tolist()}")
print(f"Decoded output: {decoded_texts}")

Input text: 'First Citizen'
Original token IDs: [[5962, 22307]]
Predicted token IDs: [[198, 11]]
Decoded output: ['\n,']
