In [3]:
import sys
import os

# Treat the kernel's current working directory as the project root and make it
# importable, so project-local packages (e.g. the `classes` imports below) resolve.
# The membership check keeps re-runs of this cell from adding duplicate entries.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from classes.experiment_config import ExperimentConfig
from classes.experiment_runner import ExperimentRunner

import torch.multiprocessing as mp
# Select the 'spawn' start method for multiprocessing. force=True replaces any
# start method already fixed in this kernel, so re-running the cell does not raise.
# NOTE(review): notebook_launcher may pick its own start method for the workers it
# creates -- confirm this global setting is the one that actually takes effect.
mp.set_start_method(method='spawn', force=True)

from accelerate import notebook_launcher
from datasets import load_dataset


In [None]:
# Batching settings handed to ExperimentConfig.
# "fixed_max_batch_size" is the fixed batch size while adaptive batching is off,
# and the upper bound on batch size when adaptive batching is on.
BATCHING_OPTIONS = {
    "fixed_max_batch_size": 1,
    "adaptive_batching": False,
    "adaptive_max_tokens": 256,
}

# Sharding settings handed to ExperimentConfig.
SHARDING_CONFIG = {
    "fsdp_config": {
        "use_orig_params": True,
        "cpu_offload": True,
    },
    "sharding_strategy": "NO_SHARD",
}

# Single configuration object consumed by ExperimentRunner and by the launcher
# (which reads `num_processes` from it).
experiment_config = ExperimentConfig(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    is_encoder_decoder="decoder_only",
    task_type="text_generation",
    # Must match what ExperimentRunner expects for the inference type.
    inference_type="pure_generative",
    max_input_tokens=512,
    max_output_tokens=50,
    num_input_prompts=5,
    gpu_list=[0, 1],
    num_processes=2,
    batching_options=BATCHING_OPTIONS,
    sharding_config=SHARDING_CONFIG,
    query_rate=1,
    decoder_temperature=1,
    fp_precision="float16",
    backend="pytorch",
)

# Load the prompt texts: the "arxiv" configuration of the lighteval/pile_helm
# dataset, "test" split. Every record's "text" field becomes one prompt.
pile_arxiv = load_dataset("lighteval/pile_helm", "arxiv")
ds = pile_arxiv["test"]
prompts = [record["text"] for record in ds]

def run_experiment():
    """Run the experiment once using the notebook-level configuration.

    Takes no arguments: it reads the module-level ``experiment_config`` and
    ``prompts``, builds an ``ExperimentRunner`` from them and calls its
    ``run_torch()`` method. This is the callable handed to the launcher.
    """
    ExperimentRunner(experiment_config, prompts).run_torch()

# Start `run_experiment` in as many processes as the configuration asks for.
notebook_launcher(
    function=run_experiment,
    num_processes=experiment_config.num_processes,
)

Launching training on 2 GPUs.
TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded using backend pytorch with precision float16.
TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded using backend pytorch with precision float16.


INFO:classes.experiment_runner:[Process 1160056] Model is on device: cuda:1
INFO:classes.experiment_runner:[Process 1160055] Model is on device: cuda:0


Original generate method reassigned


INFO:classes.experiment_runner:[Process 1160056] Dummy forward pass complete.
INFO:classes.experiment_runner:[Process 1160055] Dummy forward pass complete.


Allocated GPUs: GPU 0: NVIDIA A100-PCIE-40GB, GPU 1: NVIDIA A100-PCIE-40GB
Prompts processed: 5 prompts. Longest prompt has 71508 characters.




Energy tracking started
pure_generative task type
Fixed batching created 1 batches.


INFO:classes.experiment_runner:[Process 1160056] Inference complete.
INFO:classes.experiment_runner:[Process 1160055] Inference complete.


Flops estimation was not finished successfully because of the following exception:
<class 'RuntimeError'> : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)


Traceback (most recent call last):
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/ptflops/pytorch_engine.py", line 68, in get_flops_pytorch
    _ = flops_model(batch)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
    else self._run_ddp_forward(*inputs, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
    return self.module(*inputs, **kwargs)  # type: ignore[index]
  File "/home/228755@hertie-s

Energy tracking stopped
Flops estimation was not finished successfully because of the following exception:
<class 'RuntimeError'> : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)


Traceback (most recent call last):
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/ptflops/pytorch_engine.py", line 68, in get_flops_pytorch
    _ = flops_model(batch)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
    else self._run_ddp_forward(*inputs, **kwargs)
  File "/home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
    return self.module(*inputs, **kwargs)  # type: ignore[index]
  File "/home/228755@hertie-s