# Custom Model Notebook

Debug configuration here: [Configuration Notebook](project_config.ipynb)

Construct, test, and profile custom model architectures before pre-training.

In [None]:
# Path to the project directory to load. This must be a "model-project";
# loading any other project type will not work.
# NOTE(review): this is an absolute, machine-specific path -- change it to
# match your own checkout before running the notebook.
projects_directory =  "/home/dinalt/ai_assets/projects/train/two_layer/model_test"
# Configuration template name passed to Project().
# NOTE(review): an empty string presumably selects the project's default
# configuration -- confirm against the Project documentation.
config_template = ""

# File name of the model definition template to test. Give only the file name:
# the "models/" directory prefix is prepended when the project is loaded.
model_config_template = "two_layer.yaml"

## Available Models

In [None]:
import forgather.nb.notebooks as nb

# Display the model-project index for the directory configured above.
nb.display_model_project_index(projects_directory)

## Load Configuration

In [None]:
import sys, os
modules_path = os.path.join('..', 'src')
if modules_path not in sys.path: sys.path.insert(0, modules_path)

from pprint import pp, pformat
from IPython import display as ds
from forgather import Project
import forgather.nb.notebooks as nb
from forgather.yaml_encoder import to_yaml
from forgather.nb.notebooks import get_train_cmdline, make_train_script

# Load the project, selecting the model definition template under test.
proj = Project(config_template, projects_directory, test_model="models/" + model_config_template)

# Accumulate a single Markdown document describing the project, then render
# it once at the end of the cell.
md = ""
md += nb.render_project_readme(proj.project_dir)
md += nb.render_meta(proj.meta, "### Meta Config\n")
md += nb.render_template_list(proj.meta.find_templates(proj.meta.config_prefix), "### Available Configurations\n")

# Only construct the meta object
config_meta = proj.config.meta()
md += f"### {config_meta['config_name']}:\n\n"
md += nb.render_codeblock("python", pformat(config_meta))
md += nb.render_codeblock("yaml", proj.pp_config, "### Preprocessed Configuration\n")
md += nb.render_codeblock("yaml", to_yaml(proj.config), "### Loaded Configuration\n")

# Show generated model code, if any.
generated_code_node = proj.config['generated_model_code']

if generated_code_node is not None:
    # Materialize the generated source exactly once and reuse the result;
    # calling the node a second time would repeat the generation work.
    generated_source = generated_code_node()
    md += nb.render_codeblock("python", generated_source, "### Generated Model Code\n")
display(ds.Markdown(md))

## Instantiate New Model

In [None]:
# Calling the project returns a mapping of its constructed outputs; this
# notebook uses the 'model' and 'tokenizer' entries.
# NOTE(review): presumably this instantiates a fresh (untrained) model each
# time the cell is run -- confirm against the Project documentation.
main_output = proj()
model = main_output['model']
tokenizer = main_output['tokenizer']

def show_parameters(model):
    """Print the total and trainable parameter counts of ``model``.

    Counts are reported in millions with one decimal place. A parameter
    counts as trainable when its ``requires_grad`` flag is set.
    """
    def in_millions(count):
        return f"{count/1000000:.1f}M"

    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size

    print("Total Parameters: ", in_millions(total))
    print("Trainable Parameters: ", in_millions(trainable))

# Report the parameter counts, then pretty-print the model object itself.
show_parameters(model)
pp(model)

## Test Forward Method
This implements a simple forward and backward pass through the model to provide a "kick-test," making sure it does not fall over. If it passes, it does not mean that the model is correct, just that it can plausibly be used in a training loop without immediately raising an exception.

In [None]:
import torch
# Enable autograd anomaly detection so a failing backward pass reports the
# forward operation that produced it.
# NOTE(review): this is a global switch and it stays enabled for every later
# cell, including the profiling cells, where it adds overhead -- confirm that
# is intended.
torch.autograd.set_detect_anomaly(True)

# Verify model does not fall over when given input.
def test_model_forward(model, batch_size, seq_len, pad_probability, ignore_label=-100, device="cpu", dtype=None):
    if dtype is not None:
        model = model.to(dtype)
    model = model.to(device)
    model.train()
    opt = torch.optim.AdamW(model.parameters())
    
    opt.zero_grad()
    #input_ids = torch.arange(1, batch_size * seq_len + 1, dtype=torch.long).view(batch_size, seq_len)
    input_ids = torch.randint(1, model.config.vocab_size, (batch_size, seq_len), dtype=torch.long, device=device)

    # Generate fake padding mask
    pad_mask = torch.full(input_ids.shape, pad_probability, dtype=torch.float, device=device).bernoulli().to(dtype=torch.long, device=device)
    pad_mask = pad_mask.sort(-1, descending=True)[0]
    print("mask\n", pad_mask)

    # Replace pad values with pad_id and ignore_label
    labels = input_ids.masked_fill(~pad_mask.to(dtype=torch.bool), ignore_label)
    input_ids = input_ids.masked_fill(~pad_mask.to(dtype=torch.bool), model.config.pad_token_id)
    print("input_ids\n", input_ids)
    print("labels\n", labels)

    input_ids = input_ids
    pad_mask = pad_mask
    labels = labels
    outputs = model(input_ids=input_ids, attention_mask=pad_mask, labels=labels, return_dict=True)
    loss = outputs["loss"]
    logits = outputs["logits"]
    print("logits.shape:", logits.shape)
    print("loss:", loss)
    
    # Make sure backward pass works.
    print("Computing loss.backward()...")
    loss.backward()

    print("Unused Parameters:")
    for name, param in model.named_parameters():
        if param.grad is None:
            print(name)

    print("Performing optimizertor step...")
    opt.step()
    print("Done! Congratulations, your model passed the kick-test!")

# Smoke-test on CPU: 2 sequences of 7 tokens, where each position is kept
# (mask value 1) with probability 0.9 and padded otherwise.
test_model_forward(model, batch_size=2, seq_len=7, pad_probability=0.9, device="cpu", dtype=None)

## Torch Compile [optional]
Apply torch-compile to the model.

When used, the first pass through model forward will take much longer than normal.

In [None]:
# Optional: compile the model before profiling.
# NOTE(review): assumes `model` is a torch.nn.Module, whose compile() method
# applies torch.compile to the module in place -- confirm for this model class.
model.compile()

## Code Profiling

In [None]:
# https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
# https://pytorch.org/docs/stable/torch_cuda_memory.html
from torch.profiler import profile, record_function, ProfilerActivity

def print_prof_averages(prof, sort_by, row_limit):
    """Print one averaged profiler table for each sort key in ``sort_by``.

    Every table comes from ``prof.key_averages()`` and is truncated to
    ``row_limit`` rows.
    """
    for sort_key in sort_by:
        averages = prof.key_averages()
        print(averages.table(sort_by=sort_key, row_limit=row_limit))

def profile_train(model, batch_size, seq_len, use_cpu=True, dtype=None, row_limit=10):
    """Profile one training forward pass and one backward pass of ``model``.

    A random batch of token ids is generated and used as both input and
    labels. The forward pass and the backward pass are profiled separately,
    and the averaged profiler tables are printed after each.

    Args:
        model: Model exposing ``config.vocab_size`` that accepts
            ``input_ids``, ``labels`` and ``return_dict`` and whose output
            can be indexed with ``"loss"``.
        batch_size: Number of sequences in the random batch.
        seq_len: Number of tokens per sequence.
        use_cpu: Profile on "cpu" when True, otherwise on "cuda".
        dtype: Optional dtype the model is cast to; ``None`` leaves it as is.
        row_limit: Maximum number of rows printed per profiler table.

    Returns:
        None. Profiler tables are printed.
    """
    if use_cpu:
        device = "cpu"
        prof_activity = ProfilerActivity.CPU
        sort_by = [ "cpu_time_total" ]
    else:
        device = "cuda"
        prof_activity = ProfilerActivity.CUDA
        sort_by = [ "cpu_time_total", "cuda_time_total" ]

    model = model.to(device=device, dtype=dtype)
    model.train()

    input_ids = torch.randint(1, model.config.vocab_size, (batch_size, seq_len), dtype=torch.long, device=device)
    labels = input_ids
    print("Running Forward Pass")
    with profile(activities=[prof_activity], record_shapes=True, profile_memory=True) as prof:
        with record_function("model_forward"):
            # Request a dict-style output and read the loss by key, matching
            # how the forward smoke test in this notebook calls the model.
            # Tuple-unpacking a dict-like output would yield its keys, not
            # the loss and logits tensors.
            outputs = model(input_ids=input_ids, labels=labels, return_dict=True)
    loss = outputs["loss"]

    print_prof_averages(prof, sort_by, row_limit)

    print("Running Backward Pass")
    with profile(activities=[prof_activity], record_shapes=True) as prof:
        with record_function("loss_backward"):
            loss.backward()
    print_prof_averages(prof, sort_by, row_limit)

def profile_inference(model, batch_size, seq_len, use_cpu=True, dtype=None, row_limit=10, snapshot_path="my_snapshot.pickle"):
    """Profile one inference-mode forward pass of ``model``.

    A random batch of token ids is run through the model in eval mode under
    ``torch.inference_mode()``, and the averaged profiler tables are printed.
    When profiling on CUDA, memory-history recording is switched on for the
    duration of the call and a memory snapshot is written to
    ``snapshot_path``.

    Args:
        model: Model exposing ``config.vocab_size`` that accepts ``input_ids``.
        batch_size: Number of sequences in the random batch.
        seq_len: Number of tokens per sequence.
        use_cpu: Profile on "cpu" when True, otherwise on "cuda".
        dtype: Optional dtype the model is cast to; ``None`` leaves it as is.
        row_limit: Maximum number of rows printed per profiler table.
        snapshot_path: File the CUDA memory snapshot is dumped to (only used
            when ``use_cpu`` is False).

    Returns:
        None. Profiler tables are printed.
    """
    if use_cpu:
        device = "cpu"
        prof_activity = ProfilerActivity.CPU
        sort_by = [ "cpu_time_total" ]
    else:
        device = "cuda"
        prof_activity = ProfilerActivity.CUDA
        sort_by = [ "cpu_time_total", "cuda_time_total" ]
        torch.cuda.memory._record_memory_history()

    try:
        model = model.to(device=device, dtype=dtype)
        model.eval()

        input_ids = torch.randint(1, model.config.vocab_size, (batch_size, seq_len), dtype=torch.long, device=device)

        with torch.inference_mode():
            with profile(activities=[prof_activity], record_shapes=True, profile_memory=True) as prof:
                with record_function("model_forward"):
                    # Only the profile matters here; the output is discarded.
                    model(input_ids=input_ids)

        print_prof_averages(prof, sort_by, row_limit)
        if not use_cpu:
            torch.cuda.memory._dump_snapshot(snapshot_path)
    finally:
        if not use_cpu:
            # Stop recording so memory history does not keep accumulating
            # for the rest of the session, even if the forward pass raised.
            torch.cuda.memory._record_memory_history(enabled=None)


### CPU

In [None]:
# Profile a training step on CPU with a batch of 16 sequences of 512 tokens.
profile_train(model, 16, 512, use_cpu=True)

### GPU

In [None]:
# Profile the same training step on the "cuda" device (a CUDA GPU is required).
profile_train(model, 16, 512, use_cpu=False)