# Project Index

In [1]:
import forgather.nb.notebooks as nb
# Render the project index (meta config, template search paths and available configurations).
nb.display_project_index()

## Activation Checkpoint Test

Testing activation checkpointing.

#### Project Directory: "/home/dinalt/ai_assets/forgather/examples/trainers/activation_checkpoint"

## Meta Config
Meta Config: [/home/dinalt/ai_assets/forgather/examples/trainers/activation_checkpoint/meta.yaml](meta.yaml)

- [meta.yaml](meta.yaml)
    - [meta_defaults.yaml](../../../forgather_workspace/meta_defaults.yaml)
        - [base_directories.yaml](../../../forgather_workspace/base_directories.yaml)

Template Search Paths:
- [/home/dinalt/ai_assets/forgather/examples/trainers/activation_checkpoint/templates](templates)
- [/home/dinalt/ai_assets/forgather/forgather_workspace](../../../forgather_workspace)
- [/home/dinalt/ai_assets/forgather/templates/tiny_experiments](../../../templates/tiny_experiments)
- [/home/dinalt/ai_assets/forgather/templates/modellib](../../../templates/modellib)
- [/home/dinalt/ai_assets/forgather/templates/base](../../../templates/base)

## Available Configurations
- [test_cp.yaml](templates/experiments/test_cp.yaml)
- [control.yaml](templates/experiments/control.yaml)

Default Configuration: control.yaml



In [None]:
# Show the "test_cp.yaml" configuration with the `show_pp_config` and
# `show_generated_code` sections switched off.
nb.display_config(config_template="test_cp.yaml", show_pp_config=False, show_generated_code=False)

#### View Memory Snapshot Files Here

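Snapshot files written via the `profiler_file` argument of `profile_training_memory` can be loaded into the PyTorch memory visualizer: https://docs.pytorch.org/memory_viz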

In [None]:
from forgather.project import Project
from forgather.ml.utils import count_parameters

# Open the "test_cp.yaml" configuration and pull out the assets that the
# profiling cell below relies on.
proj = Project("test_cp.yaml")
(
    model_f,
    data_collator,
    train_dataset,
    optimizer,
) = proj("model", "data_collator", "train_dataset", "optimizer")

# `model_f` is a factory: calling it builds the model. Print the model's
# structure followed by its parameter count.
model = model_f()
print(model)
print(count_parameters(model))

In [None]:
from torch.utils.data import DataLoader
from pickle import dump
import torch
from pickle import dump
import forgather
from functools import partial
from tqdm.auto import tqdm
from forgather.ml.optim.adafactor import Adafactor
from pprint import pp

# Ready-made optimizer factories. Pass any of them as `opt_factory` (instead of
# the project's optimizer) to compare the memory each optimizer needs.
sgd_factory = partial(
    torch.optim.SGD,
    lr=1e-1,
)
adamw_factory = partial(
    torch.optim.AdamW,
    lr=1e-3,
)
adafactor_factory = partial(
    Adafactor,
    lr=1e-4,
)

def profile_training_memory(
    model,
    dataset,
    data_collator,
    opt_factory,
    device,
    batch_size,
    max_steps,
    truncate_to=None,
    shuffle=False,
    show_details=False,
    profiler_file=None
):
    """Run a short training loop and report the peak CUDA memory it used.

    Args:
        model: Module invoked as ``model(input_ids=..., labels=...)``; the call
            must return a ``(loss, logits)`` pair.
        dataset: Dataset wrapped in a ``DataLoader``. Must provide ``shuffle()``
            when ``shuffle`` is true.
        data_collator: ``collate_fn`` for the loader. Each batch must expose
            "input_ids" and "labels" tensors with the sequence on dimension 1.
        opt_factory: Callable that receives ``model.named_parameters()`` and
            returns the optimizer to step.
        device: Device the model and batches are moved to, and whose memory
            statistics are reported.
        batch_size: Batch size handed to the ``DataLoader``.
        max_steps: Number of training steps to run. ``None`` iterates over the
            whole loader.
        truncate_to: When set, clip "input_ids" and "labels" to this many
            positions along dimension 1.
        shuffle: Shuffle the dataset before building the loader.
        show_details: Also pretty-print ``torch.cuda.memory_stats(device)``.
        profiler_file: When set, record CUDA memory history during the run and
            pickle the resulting snapshot to this path.

    Raises:
        Exception: Anything raised by the forward/backward/optimizer step is
            re-raised after the offending batch index and shape are printed.
    """
    # Create the bar before entering `try` so `finally` can always close it.
    train_progress_bar = tqdm(total=max_steps, dynamic_ncols=True)
    recording = False
    try:
        if profiler_file:
            torch.cuda.memory._record_memory_history(enabled='all')
            recording = True

        model.train()
        model.to(device)
        if shuffle:
            dataset = dataset.shuffle()

        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator, pin_memory=True)
        opt = opt_factory(model.named_parameters())
        max_sequence = 0
        loss = None  # stays None when the loader yields no batches
        for step, batch in enumerate(dataloader):
            labels = batch["labels"]
            input_ids = batch["input_ids"]

            if truncate_to:
                # Each tensor is clipped from itself; slicing `labels` into
                # `input_ids` would silently train on the labels as inputs.
                labels = labels[:, :truncate_to]
                input_ids = input_ids[:, :truncate_to]

            max_sequence = max(max_sequence, input_ids.shape[1])

            input_ids = input_ids.to(device)
            labels = labels.to(device)

            try:
                loss, _ = model(input_ids=input_ids, labels=labels)
                loss.backward()
                opt.step()
            except Exception:
                print(f"Exception raised on batch {step} of {max_steps} : {input_ids.shape}")
                raise
            opt.zero_grad()
            train_progress_bar.update()
            train_progress_bar.write(f"loss: {loss.item():.4}")
            # `step` is zero-based, so stop once `max_steps` steps have completed.
            if max_steps is not None and step + 1 >= max_steps:
                break
        # Drop the optimizer (and its state) before the snapshot is taken.
        del opt

        # save a snapshot of the memory allocation to file
        if profiler_file:
            s = torch.cuda.memory._snapshot()
            with open(profiler_file, "wb") as f:
                dump(s, f)
        torch.cuda.memory._record_memory_history(enabled=None)
        recording = False
        # Query the device that was actually trained on, matching memory_stats below.
        max_allocated = torch.cuda.max_memory_allocated(device)
        model.cpu()
        print(f"maximum_sequence_length={max_sequence}")
        if loss is None:
            print("final loss=n/a (no batches were processed)")
        else:
            print(f"final loss={loss.item()}")
        print(f"max_allocated={max_allocated / 1000000000.:.3f} GB")
        if show_details:
            pp(torch.cuda.memory_stats(device))
    finally:
        # Do not leave memory-history recording switched on after a failed run.
        if recording:
            torch.cuda.memory._record_memory_history(enabled=None)
        train_progress_bar.close()

# Profile a short run using the Adafactor factory on "cuda:0".
# Set `truncate_to` to clip sequence length, or `profiler_file` to a path to
# write a memory snapshot of the run.
profile_training_memory(
    model=model,
    dataset=train_dataset,
    data_collator=data_collator,
    opt_factory=adafactor_factory,
    device="cuda:0",
    batch_size=32,
    max_steps=3,
    truncate_to=None,
    shuffle=True,
    show_details=True,
    profiler_file=None,
)

In [None]:
# Reset the peak-memory counters on cuda:0 so that the next profiling run
# reports its own peak rather than the maximum over the whole kernel session.
# `torch.cuda.reset_max_memory_allocated` is deprecated and only forwards to
# `reset_peak_memory_stats`, so call the latter directly.
torch.cuda.reset_peak_memory_stats("cuda:0")

In [None]:
from forgather.project import Project

# Create a fresh Project for "test_cp.yaml"; the cells below pass it to the notebook helpers.
proj = Project("test_cp.yaml")

In [None]:
# Generate a training script for the project.
# NOTE(review): the meaning of the "1" argument is not visible from this notebook
# (possibly a process/device count) — confirm against forgather.nb.notebooks.
nb.generate_trainingscript(proj, "1")

In [None]:
# Display the command for this project; "tb" presumably stands for TensorBoard — TODO confirm.
nb.display_tb_command(proj, local_host=False)