In [None]:
import sys
import os
from accelerate import notebook_launcher
from datasets import load_dataset
import logging

# Configure logging:
# (i) Silence everything below ERROR coming from the "codecarbon" logger.
logging.getLogger("codecarbon").setLevel(logging.ERROR)
# (ii) (Re)configure the root logger at INFO level.
#      logging.basicConfig() does nothing when the root logger already has handlers,
#      which is exactly the situation when this cell is re-run in a live kernel.
#      `force=True` makes it remove *and close* any pre-existing root handlers first,
#      so the configuration is always re-applied and old handlers are not left open.
logging.basicConfig(level=logging.INFO, force=True)

# Make the current working directory importable so that the local `classes`
# package (imported just below) can be resolved.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path += [project_root]
from classes.experiment_config import ExperimentConfig
from classes.experiment_runner import ExperimentRunner

#import torch.multiprocessing as mp
#mp.set_start_method('spawn', force=True)


In [2]:
if __name__ == "__main__":
    # Batching settings. Per the original author's note, `fixed_max_batch_size` is the
    # maximum batch size when adaptive batching is on, and the fixed batch size when it is off.
    batching_options = {
        "fixed_max_batch_size": 2,
        "adaptive_batching": False,
        "adaptive_max_tokens": 256,
    }

    # Sharding settings, handed to ExperimentConfig unchanged.
    sharding_config = {
        "fsdp_config": {"use_orig_params": True, "cpu_offload": True},
        "sharding_strategy": "NO_SHARD",
    }

    # Full experiment description consumed by ExperimentRunner below.
    experiment_config = ExperimentConfig(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        is_encoder_decoder="decoder_only",
        task_type="text_generation",
        inference_type="pure_generative",
        max_input_tokens=512,
        max_output_tokens=50,
        num_input_prompts=5,
        gpu_list=[0, 1],
        num_processes=2,
        batching_options=batching_options,
        sharding_config=sharding_config,
        query_rate=1,
        decoder_temperature=1,
        fp_precision="float16",
        backend="pytorch",
    )

    # Load prompts: the "test" split of the "arxiv" configuration of the
    # lighteval/pile_helm dataset; each record's "text" field becomes one prompt.
    ds = load_dataset("lighteval/pile_helm", "arxiv")["test"]
    prompts = [record["text"] for record in ds]

    def run_experiment():
        """Create an ExperimentRunner for the config and prompts above and call run_torch()."""
        ExperimentRunner(experiment_config, prompts).run_torch()

    # Hand run_experiment to Accelerate's notebook_launcher, using the process
    # count stored on the experiment config.
    notebook_launcher(run_experiment, num_processes=experiment_config.num_processes)
    # Original author's note: `terminate_on_error=True` is not passed here
    # ("USE THIS IN accelerate.launch.launch()").

Launching training on 2 GPUs.
Accelerator set up
Unique experiment id: 0085
TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded using pytorch, with precision float16
Original generate method saved.
Model and tokenizer prepared


INFO:classes.experiment_runner:[Process 1470026] Model is on device: cuda:1
INFO:classes.experiment_runner:[Process 1470025] Model is on device: cuda:0


Original generate method reassigned


INFO:classes.experiment_runner:[Process 1470026] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 1470025] Dummy forward pass complete


Prompts processed: 5 prompts.








Energy tracking started
Task type: pure_generative
Using fixed batching (non-adaptive): created 3 batches.


INFO:classes.experiment_runner:[Process 1470026] Inference complete
INFO:classes.experiment_runner:[Process 1470025] Inference complete


Energy tracking stopped


INFO:classes.experiment_runner:Process 1 saved its energy metrics.
INFO:classes.experiment_runner:Main process saved inference and computation metrics.


Experiment-wide inference and compute metrics saved


INFO:classes.experiment_runner:Process 0 saved its energy metrics.


All local process energy metrics saved
Experiment finished
