In [None]:

import sys
import os
from accelerate import notebook_launcher
from datasets import load_dataset
import logging
import torch

# Configure logging.
# `force=True` (Python 3.8+) removes *and closes* every handler already attached
# to the root logger before installing the new configuration, so re-running
# this cell in a live kernel does not stack handlers (duplicated log lines) and
# does not leave detached handlers open.
logging.basicConfig(level=logging.INFO, force=True)
# Only let ERROR (and above) through from the "codecarbon" logger.
logging.getLogger("codecarbon").setLevel(logging.ERROR)

# Put the current working directory on the import path so that the
# project-local packages imported right after this block can be resolved.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:  # not registered yet -> add it exactly once
    sys.path.append(project_root)
    
from configs.experiment_config import ExperimentConfig
from experiments.experiment_runner import ExperimentRunner

#import torch.multiprocessing as mp
#mp.set_start_method('spawn', force=True)

In [None]:
experiment_config = ExperimentConfig(
    # --- Model and task ------------------------------------------------------
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    is_encoder_decoder=False,
    task_type="text_generation",
    inference_type="pure_generative",
    backend="pytorch",
    # --- Prompt and generation budget ----------------------------------------
    max_input_tokens=500,  # 2048 is Llama's limit
    max_output_tokens=200,
    num_input_prompts=10,
    query_rate=1,
    # 0 -> deterministic decoding; > 0 -> do_sample is activated.
    # NB: must be a float. A higher temperature sometimes gives longer outputs:
    # when stopping depends on a token (e.g. the EOS token), a high temperature
    # might delay reaching it.
    decoder_temperature=1.0,
    # --- Output handling -----------------------------------------------------
    save_outputs=True,
    decode_token_to_text=True,
    # --- Hardware and distribution -------------------------------------------
    gpu_list=[0, 1, 2, 3],
    num_processes=4,
    sharding_config={
        "fsdp_config": {"use_orig_params": True, "cpu_offload": True},
        "sharding_strategy": "NO_SHARD",  # FULL_SHARD doesn't work
    },
    # --- Batching ------------------------------------------------------------
    batching_options={
        "batch_size___fixed_batching": 16,
        "adaptive_batching": False,
        "adaptive_max_tokens": 3000,
        "max_batch_size___adaptive_batching": 100,
    },
    # --- Numerics ------------------------------------------------------------
    fp_precision="float32",
    quantization_config={
        "quantization": False,
        # Cached value: needed until the FLOPs of quantised models can be computed.
        "cached_flops_for_quantised_models": 5_172_720_640_000,
        "load_in_8bit": False,
        "load_in_4bit": True,
        # "llm_int8_threshold": 6.0,
        "llm_int8_enable_fp32_cpu_offload": False,
        "llm_int8_has_fp16_weight": False,
    },
)



In [3]:

# Prompt source: the "test" split of the "arxiv" configuration of the
# lighteval/pile_helm dataset. The "text" field of each record is one prompt.
ds = load_dataset("lighteval/pile_helm", "arxiv", split="test")
prompts = [record["text"] for record in ds]

import time
import logging

logger = logging.getLogger(__name__)


def run_experiment_workflow(max_retries=3, retry_delay=5):
    """Run the configured experiment, retrying a failed attempt after a pause.

    Intended to be executed by every worker process started through
    ``notebook_launcher``. Each attempt builds a fresh ``ExperimentRunner``
    from the notebook-level ``experiment_config`` and ``prompts`` and calls
    ``run_torch()``. Only the main process then aggregates and saves the
    results and tears the runner down; every other process returns as soon as
    ``run_torch()`` has finished.

    Args:
        max_retries: Maximum number of attempts. Defaults to 3.
        retry_delay: Seconds to wait between two attempts. Defaults to 5.

    Returns:
        None. Exceptions raised by an attempt are logged, not propagated, so a
        failing experiment does not abort the caller.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Initialise the runner with the configuration and the prompts.
            runner = ExperimentRunner(experiment_config, prompts)

            # Every launched process executes the run itself.
            runner.run_torch()

            # Non-main processes have nothing left to do.
            if not runner.accelerator.is_main_process:
                return

            # Main process only: aggregate, persist, then clean up.
            runner.aggregate_results()
            runner.save_experiment_results()
            # NOTE(review): teardown() is only reached on the success path; a
            # failed attempt leaves its runner to be replaced by the next one.
            # Confirm whether failed attempts should also be torn down.
            runner.teardown()
        except Exception as e:
            logger.error(f"Experiment run failed on attempt {attempt}: {e}", exc_info=True)
            # Pause only when another attempt will actually follow; waiting
            # after the final failure would just delay the exit.
            if attempt < max_retries:
                time.sleep(retry_delay)
        else:
            logger.info(f"Experiment run succeeded on attempt {attempt}")
            return

    # Best effort by design: record the failure and let the caller carry on.
    logger.error("Experiment failed after maximum attempts. Moving on.")

# Start `experiment_config.num_processes` worker processes from the notebook;
# each one executes run_experiment_workflow().
# Reminder carried over from the original cell: use `terminate_on_error=True`
# in accelerate.launch.launch().
notebook_launcher(
    run_experiment_workflow,
    num_processes=experiment_config.num_processes,
)
              

Launching training on 4 GPUs.
Accelerator set up
Unique experiment id: 0270
TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded using pytorch, with precision float32
Entering wait barrier: after load_model_tokenizer
wait_for_everyone completed within 10 seconds for after load_model_tokenizer.
Exiting wait barrier: after load_model_tokenizer
Original generate method saved.
Model and tokenizer prepared
Entering wait barrier: after model preparation
wait_for_everyone completed within 10 seconds for after model preparation.


INFO:classes.experiment_runner:[Process 3720967] Model is on device: cuda:2
INFO:classes.experiment_runner:[Process 3720966] Model is on device: cuda:1


Exiting wait barrier: after model preparation

INFO:classes.experiment_runner:[Process 3720968] Model is on device: cuda:3





INFO:classes.experiment_runner:[Process 3720965] Model is on device: cuda:0


Entering wait barrier: after logging device info
wait_for_everyone completed within 10 seconds for after logging device info.
Exiting wait barrier: after logging device info
Original generate method reassigned


INFO:classes.experiment_runner:[Process 3720968] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 3720967] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 3720966] Dummy forward pass complete
INFO:classes.experiment_runner:[Process 3720965] Dummy forward pass complete


Entering wait barrier: after dummy forward pass
wait_for_everyone completed within 10 seconds for after dummy forward pass.
Exiting wait barrier: after dummy forward pass
Warm-up iteration 1 completed.Warm-up iteration 1 completed.

Warm-up iteration 1 completed.
Warm-up iteration 1 completed.
Warm-up iteration 2 completed.Warm-up iteration 2 completed.

Warm-up iteration 2 completed.
Warm-up iteration 3 completed.Warm-up iteration 2 completed.Warm-up iteration 3 completed.


Warm-up iteration 3 completed.
Warm-up iteration 3 completed.
Entering wait barrier: after warm up
wait_for_everyone completed within 10 seconds for after warm up.
Exiting wait barrier: after warm up
Prompts processed: 10 prompts.
Energy tracking started
Task type: pure_generative
Using fixed batching (non-adaptive): created 1 batches.


INFO:_6_run_inference_by_task:[Process 3720966][GPU 1] — Completed tokenisation of batch 1/1
INFO:_6_run_inference_by_task:[Process 3720967][GPU 2] — Completed tokenisation of batch 1/1
INFO:_6_run_inference_by_task:[Process 3720968][GPU 3] — Completed tokenisation of batch 1/1
INFO:_6_run_inference_by_task:[Process 3720965][GPU 0] — Completed tokenisation of batch 1/1
INFO:_6_run_inference_by_task:[Process 3720968][GPU 3] — Completed batch inference 1/1
INFO:classes.experiment_runner:[Process 3720968][GPU 3]: Inference complete
INFO:classes.experiment_runner:[Process 3720968][GPU 3]: Energy tracking stopped
INFO:_6_run_inference_by_task:[Process 3720965][GPU 0] — Completed batch inference 1/1
INFO:classes.experiment_runner:[Process 3720965][GPU 0]: Inference complete


Decoded token outputs successfully.


INFO:classes.experiment_runner:[Process 3720965][GPU 0]: Energy tracking stopped


Saved outputs
[DEBUG] Enter get_experiment_setup: Experiment ID: 0270
[DEBUG] Exiting get_experiment_setup with result: {'experiment_id': '0270', 'date_time': 'March 28, 2025 at 08:16:18 PM', 'model': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'is_encoder_decoder': False, 'task_type': 'text_generation', 'available_gpu_count': 4, 'gpu_model': '4 x NVIDIA A100-PCIE-40GB', 'available_cpu_count': 128, 'cpu_model': 'AMD EPYC 7742 64-Core Processor', 'os': 'Linux-5.15.0-113-generic-x86_64-with-glibc2.31', 'python_version': '3.10.14 (main, Apr  6 2024, 18:45:05) [GCC 9.4.0]', 'country': 'Germany', 'region': 'saxony'}
[DEBUG] Enter get_experimental_variables: Accelerator index: 0
[DEBUG] Exiting get_experimental_variables with result: {'max_input_tokens': 500, 'max_output_tokens': 200, 'number_input_prompts': 10, 'decode_token_to_text': True, 'decoder_temperature': 1.0, 'query_rate': 1, 'fp_precision': 'torch.float32', 'quantisation': {'quantization': False, 'cached_flops_for_quantised_models': 517

INFO:classes.experiment_runner:Main process saved (i) experiment setup, (ii) variables, (iii) model architecture.


Experiment-wide meta info saved
[DEBUG] Enter combine_inference_metrics: Accelerator index: 0
[DEBUG] Exiting combine_inference_metrics with result: {'number_input_prompts': 10, 'total_input_tokens': 5000, 'total_generated_tokens': 2000} & {'total_inference_time_sec': 8.481884712004103, 'average_latency_ms_per_batch': 8481.884712004103, 'throughput_queries_per_sec': 1.1789832495420933, 'throughput_tokens_per_sec': 235.7966499084187}
[DEBUG] Enter combine_comp_metrics: Accelerator index: 0
[DEBUG] All samples have length 500. Computing FLOPs for one sample.
[DEBUG] Computed FLOPs for one sample: 517272064000


INFO:_6_run_inference_by_task:[Process 3720967][GPU 2] — Completed batch inference 1/1
INFO:classes.experiment_runner:[Process 3720967][GPU 2]: Inference complete
INFO:classes.experiment_runner:[Process 3720967][GPU 2]: Energy tracking stopped


[DEBUG] Exiting combine_comp_metrics with result: flops = 5172720640000; memory = {'gpu_current_memory_allocated_bytes': 8818003456, 'gpu_max_memory_allocated_bytes': 8818003456, 'gpu_current_memory_reserved_bytes': 10938744832, 'gpu_max_memory_reserved_bytes': 10938744832}; compute_util = {'gpu_utilization_percent': [100.0, 100.0, 99.0, 100.0], 'cpu_usage_percent': 6.4, 'cpu_memory_usage_bytes': 1870880768}


INFO:classes.experiment_runner:Main process saved inference and computation metrics.


Experiment-wide inference and compute metrics saved
Entering wait barrier: after saving experiment metrics


INFO:_6_run_inference_by_task:[Process 3720966][GPU 1] — Completed batch inference 1/1
INFO:classes.experiment_runner:[Process 3720966][GPU 1]: Inference complete
INFO:classes.experiment_runner:[Process 3720966][GPU 1]: Energy tracking stopped


[DEBUG] Enter combine_energy_metrics: Process ID: 3720966, Local process index: 1wait_for_everyone completed within 10 seconds for after saving experiment metrics.
[DEBUG] Enter combine_energy_metrics: Process ID: 3720967, Local process index: 2
[DEBUG] Energy consumed: 0.0029487773034882724 kWh, which equals 10615.598292557781 joules.
[DEBUG] Enter combine_energy_metrics: Process ID: 3720968, Local process index: 3Exiting wait barrier: after saving experiment metrics
[DEBUG] Energy consumed: 0.0027371611434533726 kWh, which equals 9853.78011643214 joules.

[DEBUG] Energy consumed: 0.002023532301925415 kWh, which equals 7284.716286931494 joules.
[DEBUG] Exiting combine_energy_metrics with result: {'process_id': 3720966, 'local_process_index': 1, 'energy_results': {'cpu_power': 112.5, 'gpu_power': 750.8266498192522, 'ram_power': 2.710796356201172, 'cpu_energy': 0.00032645648790639826, 'gpu_energy': 0.0026153765367418202, 'ram_energy': 6.944278840053766e-06, 'total_energy_kwh': 0.0029487

INFO:classes.experiment_runner:Process 1: Energy metrics combined successfully.



[DEBUG] Energy consumed: 0.0024815524754756017 kWh, which equals 8933.588911712166 joules.

INFO:classes.experiment_runner:Process 3: Energy metrics combined successfully.
INFO:classes.experiment_runner:[Process 3720966][GPU 1] saved its energy metrics.
INFO:classes.experiment_runner:Process 2: Energy metrics combined successfully.





INFO:classes.experiment_runner:[Process 3720968][GPU 3] saved its energy metrics.
INFO:classes.experiment_runner:[Process 3720967][GPU 2] saved its energy metrics.


[DEBUG] Exiting combine_energy_metrics with result: {'process_id': 3720965, 'local_process_index': 0, 'energy_results': {'cpu_power': 112.5, 'gpu_power': 897.0453781888289, 'ram_power': 2.7081956863403325, 'cpu_energy': 0.00026762310513004197, 'gpu_energy': 0.0022083678778006544, 'ram_energy': 5.5614925449053665e-06, 'total_energy_kwh': 0.0024815524754756017, 'total_energy_joules': 8933.588911712166, 'final_emissions': 0.0009453474155324305}}


INFO:classes.experiment_runner:Process 0: Energy metrics combined successfully.
INFO:classes.experiment_runner:[Process 3720965][GPU 0] saved its energy metrics.


All local process energy metrics saved
Experiment finished
Destroyed process group successfully


INFO:classes.experiment_runner:Aggregating per-process energy metrics from disk.
INFO:classes.experiment_runner:Aggregated global energy results successfully from disk.
INFO:classes.experiment_runner:Saving experiment results
INFO:classes.experiment_runner:Experiment results saved to results/text_generation_results.json


Starting teardown process...
Process group not initialized.
Emptying CUDA cache...
Running garbage collection...
Teardown process complete.


INFO:__main__:Experiment run succeeded on attempt 1
