In [1]:
import sys
import os
from accelerate import notebook_launcher
from datasets import load_dataset
import logging

# Logging setup (written so the cell can be re-run safely in a notebook):
# 1. Clear any handlers already attached to the root logger; basicConfig()
#    is a no-op while the root logger still has handlers from an earlier run.
stale_handlers = list(logging.root.handlers)
for stale in stale_handlers:
    logging.root.removeHandler(stale)
# 2. Only let codecarbon report errors, then configure the root logger at INFO.
codecarbon_logger = logging.getLogger("codecarbon")
codecarbon_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.INFO)

# Put the current working directory on sys.path (at most once) so that the
# local `classes` package imported right after this can be resolved.
project_root = os.getcwd()
root_already_on_path = project_root in sys.path
if not root_already_on_path:
    sys.path.append(project_root)
from classes.experiment_config import ExperimentConfig
from classes.experiment_runner import ExperimentRunner

#import torch.multiprocessing as mp
#mp.set_start_method('spawn', force=True)

In [None]:
# Batching settings. "fixed_max_batch_size" is the fixed batch size when
# adaptive batching is off, and the maximum batch size when it is on.
batching_options = {
    "fixed_max_batch_size": 8,
    "adaptive_batching": False,
    "adaptive_max_tokens": 512,
}

# Sharding settings, with the FSDP options nested under "fsdp_config".
fsdp_config = {
    "use_orig_params": True,
    "cpu_offload": True,
}
sharding_config = {
    "fsdp_config": fsdp_config,
    "sharding_strategy": "NO_SHARD",
}

# Full description of the experiment that is handed to ExperimentRunner.
experiment_config = ExperimentConfig(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    is_encoder_decoder=False,
    task_type="text_generation",
    inference_type="pure_generative",
    max_input_tokens=512,
    max_output_tokens=512,
    num_input_prompts=100,
    save_outputs=True,
    decode_token_to_text=True,
    gpu_list=[0, 1, 2, 3],
    num_processes=4,
    batching_options=batching_options,
    sharding_config=sharding_config,
    query_rate=1,
    decoder_temperature=1.0,  # NB: must be a float, not an int
    fp_precision="float16",
    backend="pytorch",
)

# Load prompts: the "test" split of the "arxiv" configuration of the
# lighteval/pile_helm dataset, keeping only each sample's "text" field.
pile_helm_arxiv = load_dataset("lighteval/pile_helm", "arxiv")
ds = pile_helm_arxiv["test"]
prompts = []
for sample in ds:
    prompts.append(sample["text"])

# function to run the whole experiment workflow.
def run_experiment_workflow():
    """Run the whole experiment workflow inside one launched process.

    Every process builds an ``ExperimentRunner`` from the module-level
    ``experiment_config`` and ``prompts`` and calls ``run_torch()``. Only the
    main process then aggregates the results, saves them and tears the
    runner down.

    Returns:
        The ``ExperimentRunner`` instance on the main process; ``None`` on
        every other process.

    Raises:
        Any exception raised by the runner is propagated unchanged. On the
        main process ``teardown()`` is still invoked before an error from
        aggregation or saving propagates.
    """
    runner = ExperimentRunner(experiment_config, prompts)
    runner.run_torch()

    # Non-main processes are finished as soon as the run itself is done.
    if not runner.accelerator.is_main_process:
        return None

    # From here on we are guaranteed to be on the main process.
    try:
        runner.aggregate_results()
        runner.save_experiment_results()
        # Could add a step here to delete JSON files if storage is an issue.
    finally:
        # Always run teardown, even when aggregation or saving failed, so a
        # failed post-processing step does not skip the cleanup.
        runner.teardown()

    return runner

# Launch the workflow across the number of processes set in the config.
# Author's note kept from the original cell: `terminate_on_error=True` is
# intentionally not passed here -- "USE THIS IN accelerate.launch.launch()".
notebook_launcher(
    run_experiment_workflow,
    num_processes=experiment_config.num_processes,
)

Launching training on 4 GPUs.
Accelerator set up
Unique experiment id: 0186
TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded using pytorch, with precision float16
Original generate method saved.
Model and tokenizer prepared
Entering wait barrier: after model preparation
Exiting wait barrier: after model preparation

INFO:classes.experiment_runner:[Process 2626993] Model is on device: cuda:3





INFO:classes.experiment_runner:[Process 2626992] Model is on device: cuda:2
INFO:classes.experiment_runner:[Process 2626991] Model is on device: cuda:1
INFO:classes.experiment_runner:[Process 2626990] Model is on device: cuda:0


Entering wait barrier: after logging device info
Exiting wait barrier: after logging device info
Original generate method reassigned


INFO:classes.experiment_runner:[Process 2626992] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 2626991] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 2626993] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 2626990] Dummy forward pass complete


Entering wait barrier: after dummy forward pass
Exiting wait barrier: after dummy forward pass
Prompts processed: 50 prompts.
Energy tracking started
Task type: pure_generative
Using fixed batching (non-adaptive): created 7 batches.


INFO:_6_run_inference_by_task:[Process 2626991][GPU 1] — Completed tokenisation of batch 1/7
INFO:_6_run_inference_by_task:[Process 2626992][GPU 2] — Completed tokenisation of batch 1/7
INFO:_6_run_inference_by_task:[Process 2626990][GPU 0] — Completed tokenisation of batch 1/7
INFO:_6_run_inference_by_task:[Process 2626993][GPU 3] — Completed tokenisation of batch 1/7
INFO:_6_run_inference_by_task:[Process 2626991][GPU 1] — Completed tokenisation of batch 2/7
INFO:_6_run_inference_by_task:[Process 2626993][GPU 3] — Completed tokenisation of batch 2/7
INFO:_6_run_inference_by_task:[Process 2626992][GPU 2] — Completed tokenisation of batch 2/7
INFO:_6_run_inference_by_task:[Process 2626990][GPU 0] — Completed tokenisation of batch 2/7
INFO:_6_run_inference_by_task:[Process 2626991][GPU 1] — Completed tokenisation of batch 3/7
INFO:_6_run_inference_by_task:[Process 2626993][GPU 3] — Completed tokenisation of batch 3/7
INFO:_6_run_inference_by_task:[Process 2626992][GPU 2] — Completed tok

Decoded token outputs successfully.
Energy tracking stopped
Entering wait barrier: after energy tracking stop


INFO:classes.experiment_runner:[Process 2626992][GPU 2]: Inference complete


Exiting wait barrier: after energy tracking stop
Saved outputs
[DEBUG] Enter get_experiment_setup: Experiment ID: 0186
[DEBUG] Exiting get_experiment_setup with result: {'experiment_id': '0186', 'date_time': 'March 25, 2025 at 05:30:57 PM', 'model': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'is_encoder_decoder': False, 'task_type': 'text_generation', 'available_gpu_count': 4, 'gpu_model': '4 x NVIDIA A100-PCIE-40GB', 'available_cpu_count': 128, 'cpu_model': 'AMD EPYC 7742 64-Core Processor', 'os': 'Linux-5.15.0-113-generic-x86_64-with-glibc2.31', 'python_version': '3.10.14 (main, Apr  6 2024, 18:45:05) [GCC 9.4.0]', 'country': 'Germany', 'region': 'saxony'}
[DEBUG] Enter get_experimental_variables: Accelerator index: 0
[DEBUG] Exiting get_experimental_variables with result: {'max_input_tokens': 256, 'max_output_tokens': 128, 'number_input_prompts': 50, 'decode_token_to_text': True, 'decoder_temperature': 1.0, 'query_rate': 1, 'fp_precision': 'torch.float16', 'quantisation': None, 'batching

INFO:classes.experiment_runner:Main process saved (i) experiment setup, (ii) variables, (iii) model architecture.


Experiment-wide meta info saved
[DEBUG] Enter combine_inference_metrics: Accelerator index: 0
[DEBUG] Exiting combine_inference_metrics with result: {'number_input_prompts': 50, 'total_input_tokens': 12800, 'total_generated_tokens': 6400} & {'total_inference_time_sec': 24.916690498008393, 'average_latency_ms_per_batch': 3559.527214001199, 'throughput_queries_per_sec': 2.006687043931317, 'throughput_tokens_per_sec': 256.8559416232086}
[DEBUG] Enter combine_comp_metrics: Accelerator index: 0
[DEBUG] All samples have length 256. Computing FLOPs for one sample.
[DEBUG] Computed FLOPs for one sample: 264843296768
[DEBUG] Exiting combine_comp_metrics with result: flops = 13242164838400; memory = {'gpu_current_memory_allocated_bytes': 4419744768, 'gpu_max_memory_allocated_bytes': 4419744768, 'gpu_current_memory_reserved_bytes': 6933184512, 'gpu_max_memory_reserved_bytes': 6933184512}; compute_util = {'gpu_utilization_percent': [68.0, 100.0, 100.0, 100.0], 'cpu_usage_percent': 5.8, 'cpu_memory

INFO:classes.experiment_runner:Main process saved inference and computation metrics.


Experiment-wide inference and compute metrics saved
Entering wait barrier: after saving experiment metrics
Exiting wait barrier: after saving experiment metrics[DEBUG] Enter combine_energy_metrics: Process ID: 2626992, Local process index: 2[DEBUG] Enter combine_energy_metrics: Process ID: 2626991, Local process index: 1[DEBUG] Enter combine_energy_metrics: Process ID: 2626993, Local process index: 3



[DEBUG] Enter combine_energy_metrics: Process ID: 2626990, Local process index: 0[DEBUG] Energy consumed: 0.006255023750990785 kWh, which equals 22518.08550356683 joules.[DEBUG] Energy consumed: 0.005931830432803333 kWh, which equals 21354.589558092 joules.[DEBUG] Energy consumed: 0.005948274421640506 kWh, which equals 21413.78791790582 joules.



[DEBUG] Exiting combine_energy_metrics with result: {'process_id': 2626992, 'local_process_index': 2, 'energy_results': {'cpu_power': 112.5, 'gpu_power': 645.6149966768064, 'ram_power': 3.8208532333374023, 'cpu_energy': 0.0007940554781780522, 

INFO:classes.experiment_runner:Process 2: Energy metrics combined successfully.
INFO:classes.experiment_runner:Process 1: Energy metrics combined successfully.


[DEBUG] Exiting combine_energy_metrics with result: {'process_id': 2626990, 'local_process_index': 0, 'energy_results': {'cpu_power': 112.5, 'gpu_power': 737.0377680446375, 'ram_power': 3.819994926452637, 'cpu_energy': 0.0007791859869066685, 'gpu_energy': 0.005375635689394365, 'ram_energy': 2.166656364000457e-05, 'total_energy_kwh': 0.006176488239941039, 'total_energy_joules': 22235.35766378774, 'final_emissions': 0.002352933195005539}}

INFO:classes.experiment_runner:[Process 2626992][GPU 2] saved its energy metrics.
INFO:classes.experiment_runner:Process 3: Energy metrics combined successfully.
INFO:classes.experiment_runner:[Process 2626991][GPU 1] saved its energy metrics.





INFO:classes.experiment_runner:[Process 2626993][GPU 3] saved its energy metrics.
INFO:classes.experiment_runner:Process 0: Energy metrics combined successfully.
INFO:classes.experiment_runner:[Process 2626990][GPU 0] saved its energy metrics.


All local process energy metrics saved
Experiment finished
Destroyed process group successfully


INFO:classes.experiment_runner:Aggregating per-process energy metrics from disk.
INFO:classes.experiment_runner:Aggregated global energy results successfully from disk.
INFO:classes.experiment_runner:Saving experiment results
INFO:classes.experiment_runner:Experiment results saved to results/text_generation_results.json


Starting teardown process...
Process group not initialized.
Emptying CUDA cache...
Running garbage collection...
Teardown process complete.
