# Configuration Notebook
Useful for debugging configurations and viewing project configuration details.

## Setup
Configure defaults and select a project.

In [2]:
# Imports used by this cell.
import os

from ipyfilechooser import FileChooser

# Defaults: the directory holding the projects and the project to start in.
default_projects_directory = '../examples/trainers'
default_project = "compare_trainers"

# Name of the configuration template to load; leave empty to use the default.
config_template = ""

# Directory-only chooser opened on the default project directory.
fc = FileChooser(
    os.path.join(default_projects_directory, default_project),
    title="Select a Project Directory",
    show_only_dirs=True,
    select_default=True,
)
display(fc)

FileChooser(path='/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers', filename='', title='Se…

## Project Info

In [3]:
import os
import sys

# Put '../src' at the front of sys.path (only once) so the imports below can
# find the modules located there.
modules_path = os.path.join('..', 'src')
if modules_path not in sys.path:
    sys.path.insert(0, modules_path)

from pprint import pformat, pp
from IPython import display as ds
from forgather.config import (
    ConfigEnvironment,
    pconfig,
)
from aiws.config import preprocessor_globals, MetaConfig
import aiws.notebooks as nb

# fc.selected_path is None when no directory has been selected in the chooser.
# Test for that first so the message below is reported instead of a TypeError
# raised by os.path.exists(None).
assert fc.selected_path and os.path.exists(fc.selected_path), "Project directory does not exist."

# Show the project's README and meta-config, then list its configurations.
nb.show_project_readme(fc.selected_path)
meta = MetaConfig(fc.selected_path)
nb.display_meta(meta, "### Meta Config\n")
nb.list_templates(meta.find_templates(meta.config_prefix), "### Available Configurations\n")

# Report which configuration the project treats as its default.
default_config = meta.default_config()
print('-' * 60)
print(f"Default Configuration: {default_config}")

### Meta Config
Project Directory: ../examples/trainers/compare_trainers

Meta Config: [../examples/trainers/compare_trainers/meta.yaml](../examples/trainers/compare_trainers/meta.yaml)

Template Search Paths:
- [../examples/trainers/compare_trainers/templates](../examples/trainers/compare_trainers/templates)
- [../templates](../templates)


### Available Configurations
- [accel_trainer.yaml](../examples/trainers/compare_trainers/templates/experiments/accel_trainer.yaml)
- [trainer.yaml](../examples/trainers/compare_trainers/templates/experiments/trainer.yaml)
- [hf_trainer.yaml](../examples/trainers/compare_trainers/templates/experiments/hf_trainer.yaml)


------------------------------------------------------------
Default Configuration: trainer.yaml


## List Available Templates
This will list all templates within the searchpath.

In [None]:
def list_templates(prefix):
    """Display the templates returned by ``meta.find_templates(prefix)`` under a "Templates" heading."""
    found_templates = meta.find_templates(prefix)
    nb.list_templates(found_templates, "### Templates\n")


# An empty prefix is passed here to list every template on the search path.
list_templates('')

In [None]:
# Show what meta.config_path() returns for the current `config_template`
# (the Configuration section assigns this same call to `config_template_path`).
meta.config_path(config_template)

## Configuration
Set `config_template` (defined in the Setup cell) to the name of a configuration template in the project; see "Available Configurations" above.
If it is left empty, the project's default configuration will be used.

In [4]:
# Create the configuration environment from the project's template search
# path and the preprocessor globals for the selected project directory.
environment = ConfigEnvironment(
    searchpath=meta.searchpath,
    globals=preprocessor_globals(fc.selected_path),
)

# Resolve the template to load from `config_template`.
config_template_path = meta.config_path(config_template)

# Show the preprocessed template, then the tree of templates it references.
nb.display_preprocessed_template(
    environment,
    config_template_path,
    title="### Preprocessed Configuration\n",
)
nb.display_referenced_templates_tree(
    environment,
    config_template_path,
    "### Included Templates\n",
)

# Load the configuration definition and its preprocessed text.
# NOTE(review): the recorded output shows a different logging_dir timestamp in
# the loaded configuration than in the preprocessed listing, so load() appears
# to preprocess the template again -- confirm.
config, pp_config = environment.load(config_template_path).get()
nb.display_referenced_source_list(config, "### Included Sources\n")
display(ds.Markdown("### Loaded Configuration\n"))
pconfig(config)

### Preprocessed Configuration
```yaml
# Just use the tiny project defaults.
#---------------------------------------
#             Default Trainer            
#---------------------------------------
# 2024-07-24 04:54:13
# Description: Train with aiws.trainer.Trainer implementation
# Project Dir: /home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers
# Current Working Dir: "/home/dinalt/ai_assets/forgather/notebooks"
# Forgather Config Dir: "/home/dinalt/.config/forgather"
# Model: default_model
# Hostname: hal9000
# Versions:
#     python: 3.10.13
#     torch: 2.3.1
#     transformers: 4.41.2
#     accelerate: 0.31.0

############# Config Vars ##############

# ns.models_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models"
# ns.tokenizers_dir: "/home/dinalt/ai_assets/forgather/tokenizers"
# ns.datasets_dir: "/home/dinalt/ai_assets/forgather/datasets"
# ns.model_src_dir: "/home/dinalt/ai_assets/forgather/model_src"
# ns.output_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model"
# ns.logging_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model/runs/default_trainer_1721796853009181271"
# ns.create_new_model: True
# ns.save_model: False
# ns.train: True
# ns.eval: False

####### Distributed Environment ########

.define: &distributed_env !callable:aiws.distributed:DistributedEnvironment

############# Dependencies #############



################ Model #################

# https://huggingface.co/docs/transformers/en/model_doc/auto
.define: &model_constructor_args {}

# Name: Tiny
# Description: A Plain Vanilla Transformer.
# model_def.cls = "VanillaTransformer"
# model_def.cfg_cls = "VanillaTransformerConfig"
# model_def.config_path = "/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py"
# model_def.model_path = "/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py"

# **Tokenizer**

# Load custom tokenizer from sub-project definition
.define: &tokenizer !callable:aiws.construct:load_from_config
    project_dir: "/home/dinalt/ai_assets/forgather/examples/tokenizers/tiny_stories_bpe"
    config_template: "2k.yaml"

# **Model Config**

.define: &model_config
    vocab_size: !callable:forgather.construct:length [ *tokenizer ]
    max_sequence_length: !callable:forgather.construct:get_attr [ *tokenizer, 'model_max_length' ]
    hidden_size: 256
    dim_feedforward: 1024
    num_attention_heads: 2
    num_hidden_layers: 4

# **Model Constructor**

# Custom transformer model; registers for AutoClass and will save code with weights.
.define: &model !callable:aiws.construct:register_for_auto_class
    - !callable:/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py:VanillaTransformer
        args:
            - !callable:aiws.construct:register_for_auto_class
                - !callable:/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py:VanillaTransformerConfig
                    <<: *model_config
        kwargs:
            <<: *model_constructor_args

############### Datasets ###############

# Name: TinyStories Abridged
# Define: Abridged to 10% of original size; Dataset containing synthetically generated (by GPT-3.5 and GPT-4) short stories that only use a small vocabulary.
# Source: https://arxiv.org/abs/2305.07759
# Train Dataset: "roneneldan/TinyStories" : "train"
# Eval Dataset: "roneneldan/TinyStories" : "validation"

# **Source Datasets**

.define: &train_source_dataset !callable:datasets:load_dataset
    args:
        - "roneneldan/TinyStories"

.define: &eval_source_dataset !callable:datasets:load_dataset
    args:
        - "roneneldan/TinyStories"

# **Dataset Splits**

.define: &train_dataset_split !callable:forgather.construct:get_item
    - *train_source_dataset
    - "train"

.define: &eval_dataset_split !callable:forgather.construct:get_item
    - *train_source_dataset
    - "validation"

# **Tokenize Args**

.define: &tokenize_args
    truncation: True

# **Tokenized Datasets**

.define: &train_dataset !callable:aiws.datasets:tokenize_dataset
    dataset: *train_dataset_split
    tokenizer: *tokenizer
    select_range: 0.1
    desc: "Tokenizing train"
    fn_kwargs:
        <<: *tokenize_args

.define: &eval_dataset !callable:aiws.datasets:tokenize_dataset
    dataset: *eval_dataset_split
    tokenizer: *tokenizer
    select_range: 500
    desc: "Tokenizing validation split"
    fn_kwargs:
        <<: *tokenize_args

############ Data Collator #############

# Data collator for causal model
# Batches are dynamically padded to longest sequence
# labels are set to input_ids, with pad tokens set to -100
# https://huggingface.co/docs/transformers/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
.define: &data_collator !callable:transformers:DataCollatorForLanguageModeling
    args:
        - *tokenizer
    kwargs:
        mlm: False
        return_tensors: pt

########## Trainer Callbacks ###########

# **Dependencies**

# Experiment tracking: Tensorboard SummaryWriter
.define: &summary_writer !callable:torch.utils.tensorboard:SummaryWriter
    - "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model/runs/default_trainer_1721796853009181271"

# Additional data to record to experiment loggers
.define: &experiment_info
    date: "2024-07-24 04:54:13"
    name: "Default Trainer"
    description: "Train with aiws.trainer.Trainer implementation"
    config: !callable:pp_config
    versions: {'python': '3.10.13', 'torch': '2.3.1', 'transformers': '4.41.2', 'accelerate': '0.31.0'}

# **Callback List**

.define: &trainer_callbacks
    - !callable:aiws.json_logger:JsonLogger
        <<: *experiment_info
    # Log configuration and metrics to Tensorboard file
    - !callable:aiws.tb_logger:TBLogger
        args: [ *summary_writer ]
        kwargs:
            <<: *experiment_info
############### Trainer ################

# Name: Custom aiws.trainer.Trainer
# Description: A lightweight, extensible trainer; does not support multiple GPUs

# **Trainer Args**

.define: &trainer_args
    # Base Trainer Defaults
    # https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments
    output_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model"
    logging_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model/runs/default_trainer_1721796853009181271"
    overwrite_output_dir: True
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    learning_rate: 1.0e-3
    num_train_epochs: 1
    eval_steps: 500
    logging_steps: 500
    eval_strategy: "steps"
    save_strategy: "no"
    logging_strategy: "steps"
    lr_scheduler_type: "constant"

    # Tiny Project Overrides
    per_device_train_batch_size: 64
    per_device_eval_batch_size: 64
    logging_steps: 100
    eval_steps: 500
    dataloader_num_workers: 1

# **Trainer Constructor**

.define: &trainer !callable:aiws.trainer:Trainer
    model: *model
    args: !callable:aiws.trainer_types:TrainingArguments
        <<: *trainer_args
    data_collator: *data_collator
    train_dataset: *train_dataset
    eval_dataset: *eval_dataset
    tokenizer: *tokenizer
    callbacks: *trainer_callbacks

#---------------------------------------
#          Configuration Output          
#---------------------------------------
meta: &meta_output
    config_name: "Default Trainer"
    config_description: "Train with aiws.trainer.Trainer implementation"
    project_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers"
    models_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models"
    tokenizers_dir: "/home/dinalt/ai_assets/forgather/tokenizers"
    datasets_dir: "/home/dinalt/ai_assets/forgather/datasets"
    output_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model"
    model_zoo_dir: "/home/dinalt/ai_assets/forgather/model_src"
    logging_dir: "/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model/runs/default_trainer_1721796853009181271"

main: !callable:aiws.training_script:TrainingScript
    meta: *meta_output
    do_save: False
    do_train: True
    do_eval: False
    distributed_env: *distributed_env
    trainer: *trainer
```


### Included Templates
- [experiments/trainer.yaml](../examples/trainers/compare_trainers/templates/experiments/trainer.yaml)
    - [project.yaml](../examples/trainers/compare_trainers/templates/project.yaml)
        - [types/training_script/causal_lm/causal_lm.yaml](../templates/types/training_script/causal_lm/causal_lm.yaml)
            - [types/training_script/training_script.yaml](../templates/types/training_script/training_script.yaml)
                - [types/type.yaml](../templates/types/type.yaml)
                    - [inc/formatting.jinja](../templates/inc/formatting.jinja)
                - [inc/formatting.jinja](../templates/inc/formatting.jinja)
            - [inc/formatting.jinja](../templates/inc/formatting.jinja)
            - [models/abstract/load_model.yaml](../templates/models/abstract/load_model.yaml)
                - [models/abstract/causal_lm_from_pretrained.yaml](../templates/models/abstract/causal_lm_from_pretrained.yaml)
                    - [models/abstract/base_language_model.yaml](../templates/models/abstract/base_language_model.yaml)
                        - [inc/formatting.jinja](../templates/inc/formatting.jinja)
            - [callbacks/loggers.yaml](../templates/callbacks/loggers.yaml)
                - [callbacks/base_callbacks.yaml](../templates/callbacks/base_callbacks.yaml)
                    - [inc/formatting.jinja](../templates/inc/formatting.jinja)
            - [trainers/trainer.yaml](../templates/trainers/trainer.yaml)
                - [trainers/base_trainer.yaml](../templates/trainers/base_trainer.yaml)
                    - [inc/formatting.jinja](../templates/inc/formatting.jinja)
        - [paths/default_paths.yaml](../templates/paths/default_paths.yaml)
        - [datasets/tiny/tiny_stories_abridged.yaml](../templates/datasets/tiny/tiny_stories_abridged.yaml)
            - [datasets/tiny/tiny_stories.yaml](../templates/datasets/tiny/tiny_stories.yaml)
                - [datasets/abstract/base_datasets.yaml](../templates/datasets/abstract/base_datasets.yaml)
                    - [inc/formatting.jinja](../templates/inc/formatting.jinja)
        - [project.trainer_config](../examples/trainers/compare_trainers/templates/project.yaml)
        - [project.model_config](../examples/trainers/compare_trainers/templates/project.yaml)
            - [models/tiny/tiny.yaml](../templates/models/tiny/tiny.yaml)
                - [models/vanilla_transformer.yaml](../templates/models/vanilla_transformer.yaml)
                    - [models/abstract/custom_causal_lm.yaml](../templates/models/abstract/custom_causal_lm.yaml)
                        - [models/abstract/base_language_model.yaml](../templates/models/abstract/base_language_model.yaml)
                            - [inc/formatting.jinja](../templates/inc/formatting.jinja)
                - [tokenizers/tiny_2k.yaml](../templates/tokenizers/tiny_2k.yaml)
        - [callbacks/loggers.yaml](../templates/callbacks/loggers.yaml)
            - [callbacks/base_callbacks.yaml](../templates/callbacks/base_callbacks.yaml)
                - [inc/formatting.jinja](../templates/inc/formatting.jinja)


### Included Sources
- [/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py](/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py) : VanillaTransformer
- [/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py](/home/dinalt/ai_assets/forgather/model_src/vanilla_transformer/vanilla_transformer.py) : VanillaTransformerConfig


### Loaded Configuration


main:
  Latent &139808407668832 'aiws.training_script:TrainingScript'
    distributed_env: Latent &139808407738848 'aiws.distributed:DistributedEnvironment'
    do_eval: False
    do_save: False
    do_train: True
    meta:
      config_description: 'Train with aiws.trainer.Trainer implementation'
      config_name: 'Default Trainer'
      datasets_dir: '/home/dinalt/ai_assets/forgather/datasets'
      logging_dir:
        '/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model/runs/default_trainer_1721796853082838051'
      model_zoo_dir: '/home/dinalt/ai_assets/forgather/model_src'
      models_dir:
        '/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models'
      output_dir:
        '/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers/output_models/default_model'
      project_dir: '/home/dinalt/ai_assets/forgather/examples/trainers/compare_trainers'
      tokenizers_dir: '/home/dinalt/ai_assets

## Materialized Configuration

Instantiate the configuration from the definition.

In [None]:
# Load the configuration again; this rebinds `config` and `pp_config`.
config, pp_config = environment.load(config_template_path).get()

# Note: We inject the pre-processed config as an argument, which can then be used to log this information.
main_output = config.main(pp_config=pp_config)
# Display the object returned by config.main().
pconfig(main_output)

### Run Configuration

Assuming that the output object has a 'run' method (training scripts do), the following will run it.

For a more robust approach, see: [train.ipynb](train.ipynb)

In [None]:
# Call run() on the object produced by config.main(...) in the
# "Materialized Configuration" cell; that cell must have been executed first.
main_output.run()

### Cleanup
Note: These will show the target directory and ask for confirmation before proceeding.

#### Delete All

In [None]:
# Delete the directory named by config.meta['models_dir'].
# The comma between the two arguments was missing, which made this cell a SyntaxError.
nb.delete_dir(config.meta['models_dir'], "Delete all models in project")

#### Delete Configuration Output Directory
This will delete the model and logs for the current configuration.

In [None]:
# Delete the directory named by config.meta['output_dir'].
# The second argument is presumably the text shown with the confirmation prompt -- verify in aiws.notebooks.
nb.delete_dir(config.meta['output_dir'], "Delete output directory")