In [43]:
import sys
import os
import matplotlib.pyplot as plt

# Add the parent directory to Python path
parent_dir = os.path.dirname(os.getcwd())
sys.path.insert(0, parent_dir)

In [44]:
baseline_values = {
    "sum_decode_tokens": 3.38283913e-05,
    "sum_prefill_tokens": 9.82346868e-06,
    "max_prefill_tokens": -3.11237143e-06,
    "num_prefills": 1.50291993e-03,
    "sum_decode_tokenss2": 4.24173346e-08,
    "sum_decode_tokensmsumprefill_tokens": -1.06897441e-07,
    "sum_decode_tokensmmaxprefill_tokens": 1.92844617e-07,
    "sum_decode_tokensmnumprefills": 2.60430816e-05,
    "sum_prefill_tokenss2": -7.72212201e-09,
    "sum_prefill_tokensmmaxprefill_tokens": 2.67059068e-08,
    "sum_prefill_tokensmnumprefills": 7.20303280e-06,
    "max_prefill_tokenss2": -1.06904337e-08,
    "max_prefill_tokensmnumprefills": -1.05254706e-05,
    "num_prefillss2": -9.19828725e-04,
    "intercept": 0.005708624032334771,
    "schedule_time": 544,
    "update_time": 80,
    "queue_overhead": 1000,
    "vllm_overhead": 6000
}

In [None]:
# Bounded region of parameter space


pbounds = {
    'sum_decode_tokens': (0, 0.0001),
    'sum_prefill_tokens': (0, 0.00001),
    'max_prefill_tokens': (0, 0.00001),
    'num_prefills': (0, 0.01),
    'sum_decode_tokenss2': (0, 0.0000001),
    'sum_decode_tokensmsumprefill_tokens': (0, 0.000001),
    'sum_decode_tokensmmaxprefill_tokens': (0, 0.000001),
    'sum_decode_tokensmnumprefills': (0, 0.0001),
    'sum_prefill_tokenss2': (0, 0.0000001), 
    'sum_prefill_tokensmmaxprefill_tokens': (0, 0.000001),
    'sum_prefill_tokensmnumprefills': (0, 0.0001),
    'max_prefill_tokenss2': (0, 0.000001),
    'max_prefill_tokensmnumprefills': (0, 0.0001),
    'num_prefillss2': (0, 0.001),
    'intercept': (0, 0.01),
    'schedule_time': (0, 600),
    'update_time': (0, 100),
    'queue_overhead': (0, 1100),
    'vllm_overhead': (0, 6000),}

In [None]:
# helper functions for running commands and parsing output
import subprocess
import numpy as np

from pytools.experiments.utils import find_matching_experiments, parse_sim_results, parse_vllm_results, calculate_errors, plot_results

def run_command(command):
    try:
        # print(f"Running command: {' '.join(command)}")
        result = subprocess.run(command, check=True, capture_output=True)
        # print(f"Command '{' '.join(command)}' executed successfully.")
        return result.stdout.decode('utf-8')
    
    
    except subprocess.CalledProcessError as e:
        print(f"Error executing command '{' '.join(command)}': {e}")
        print(f"Output: {e.output}")
        print(f"Error output: {e.stderr}")
        raise
    
def parse_output(raw_results: str):
    # split tests by newlines
    
    results = raw_results.split("\n\n")

    sim_values = []
    
    for i in range(len(results)-1):
        result = results[i]
    
        # find list after "TTFTs             :"
        try:
            start_index = result.find("TTFTs             :") + len("TTFTs             :")
            end_index = result.find("\n", start_index)
            ttfts_str = result[start_index:end_index-3].strip()
            ttfts = [float(x) for x in ttfts_str.strip("[]").split(", ")]
            # find the list after "E2Es             :"
            start_index = result.find("E2Es             :") + len("E2Es             :")
            end_index = result.find("\n", start_index)
            e2es_str = result[start_index:end_index-3].strip()
            e2es = [float(x) for x in e2es_str.strip("[]").split(", ")]
            sim_values.append((np.array(ttfts), np.array(e2es)))
        except ValueError as e:
            print(f"Error parsing output: {e}")
            sim_values.append((np.array([]), np.array([])))

    return sim_values

def parse_outputs_from_file(sim_dir: str, vllm_dir: str, sweep_configs: dict):
    # sim_dir = "./results/sweep_params"
    # vllm_dir = "./blackbox"

    # sweep_configs = {
    #     'num_prompts': [400],
    #     'request_rate': [32],
    #     'temperature': [0.0],
    #     'max_num_batched_tokens': [256],
    #     'long_prefill_token_threshold': [16],
    #     'datasets': [{'name': 'sharegpt', 'path': 'ShareGPT_V3_unfiltered_cleaned_split.json'}]
    # }
    # # Find matching experiments
    matching_pairs = find_matching_experiments(sim_dir, vllm_dir, sweep_configs)

    # print(f"Found {len(matching_pairs)} matching experiment pairs")
    vllm_ttfts = []
    vllm_e2es = []
    sim_ttfts = []
    sim_e2es = []
    # Process each pair
    for sim_file, vllm_file, config in matching_pairs:
        # print(f"\nProcessing: {os.path.basename(sim_file)}")
        
        # Parse results
        sim_results = parse_sim_results(sim_file)
        sim_ttfts.extend(sim_results['ttfts'])
        sim_e2es.extend(sim_results['e2es'])
        vllm_results = parse_vllm_results(vllm_file)
        vllm_ttfts.extend(vllm_results['ttfts'])
        vllm_e2es.extend(vllm_results['e2es'])

    vllm_values = (np.array(vllm_ttfts), np.array(vllm_e2es))
    sim_values = (np.array(sim_ttfts), np.array(sim_e2es))
    return sim_values, vllm_values

def get_error(sim_values, vllm_values):
    # calculate mse
    if len(sim_values) != len(vllm_values):
        raise ValueError("Length of simulation values and vLLM values must match")
    
    vllm_ttfts, vllm_e2es = vllm_values
    sim_ttfts, sim_e2es = sim_values
    mse = 0.0
    for i in range(len(vllm_ttfts)):
        mse_ttfts = (sim_ttfts[i] - vllm_ttfts[i]) ** 2
        mse_e2es = (sim_e2es[i] - vllm_e2es[i]) ** 2
        mse += (mse_ttfts + mse_e2es) / 2
        
    mse /= len(vllm_ttfts)
    return mse


# main optimizer function

def black_box_function(sum_decode_tokens: float, sum_prefill_tokens: float, max_prefill_tokens: float, num_prefills: float, sum_decode_tokenss2: float, sum_decode_tokensmsumprefill_tokens: float, sum_decode_tokensmmaxprefill_tokens: float, sum_decode_tokensmnumprefills: float, sum_prefill_tokenss2: float, sum_prefill_tokensmmaxprefill_tokens: float, sum_prefill_tokensmnumprefills: float, max_prefill_tokenss2: float, max_prefill_tokensmnumprefills: float, num_prefillss2: float, intercept: float, schedule_time: float, update_time: float, queue_overhead: float, vllm_overhead: float):
    
    # run a python subprocess to execute the vllm command
    sim_dir = "./results/sweep_params"
    vllm_dir = "./blackbox"

    sweep_configs = {
        'num_prompts': [100, 200, 400, 800, 1600],
        'request_rate': [4, 8, 16, 32, 64, 128],
        'temperature': [0.0],
        'max_num_batched_tokens': [256, 512, 1024, 2048, 4096, 8192],
        'long_prefill_token_threshold': [16, 32, 64, 512, 1024, 512, 1024],
        'datasets': [{'name': 'sharegpt', 'path': 'ShareGPT_V3_unfiltered_cleaned_split.json'}]
    }

    coefficients = [sum_decode_tokens, sum_prefill_tokens, max_prefill_tokens, num_prefills, sum_decode_tokenss2, sum_decode_tokensmsumprefill_tokens, sum_decode_tokensmmaxprefill_tokens, sum_decode_tokensmnumprefills, sum_prefill_tokenss2, sum_prefill_tokensmmaxprefill_tokens, sum_prefill_tokensmnumprefills, max_prefill_tokenss2, max_prefill_tokensmnumprefills, num_prefillss2, intercept]
    coefficients_str = ','.join(map(str, coefficients))

    # schedule_time = 544
    # update_time = 80
    # queue_overhead = 1000
    # vllm_overhead = 6000

    command = ["python","request_rate_sweep.py", "--rates"]
    command.extend(list(map(str, sweep_configs["request_rate"])))
    command.append("--long_prefill_token_thresholds")
    command.extend(list(map(str, sweep_configs["long_prefill_token_threshold"])))
    command.append("--max_num_batched_tokens")
    command.extend(list(map(str, sweep_configs["max_num_batched_tokens"])))
    command.append("--num_requests")
    command.extend(list(map(str, sweep_configs["num_prompts"])))
    command.extend(["--input_filename", "data/output_tokens_2025-06-30_tokenized.json",
            "--regression_coeffs", f'{coefficients_str}',
            "--schedule_time", f"{str(int(schedule_time))}",
            "--update_time", f"{str(int(update_time))}",
            "--queue_overhead_time", f"{str(int(queue_overhead))}",
            "--vllm_overhead_time", f"{str(int(vllm_overhead))}"])
    result = run_command(command)
    
    sim_values, vllm_values = parse_outputs_from_file(sim_dir, vllm_dir, sweep_configs)

    error = get_error(sim_values, vllm_values)
    
    return error


In [47]:
def objective(trial):
    sum_decode_tokens = trial.suggest_float('sum_decode_tokens', *pbounds['sum_decode_tokens'])
    sum_prefill_tokens = trial.suggest_float('sum_prefill_tokens', *pbounds['sum_prefill_tokens'])
    max_prefill_tokens = trial.suggest_float('max_prefill_tokens', *pbounds['max_prefill_tokens'])
    num_prefills = trial.suggest_float('num_prefills', *pbounds['num_prefills'])
    sum_decode_tokenss2 = trial.suggest_float('sum_decode_tokenss2', *pbounds['sum_decode_tokenss2'])
    sum_decode_tokensmsumprefill_tokens = trial.suggest_float('sum_decode_tokensmsumprefill_tokens', *pbounds['sum_decode_tokensmsumprefill_tokens'])
    sum_decode_tokensmmaxprefill_tokens = trial.suggest_float('sum_decode_tokensmmaxprefill_tokens', *pbounds['sum_decode_tokensmmaxprefill_tokens'])
    sum_decode_tokensmnumprefills = trial.suggest_float('sum_decode_tokensmnumprefills', *pbounds['sum_decode_tokensmnumprefills'])
    sum_prefill_tokenss2 = trial.suggest_float('sum_prefill_tokenss2', *pbounds['sum_prefill_tokenss2'])
    sum_prefill_tokensmmaxprefill_tokens = trial.suggest_float('sum_prefill_tokensmmaxprefill_tokens', *pbounds['sum_prefill_tokensmmaxprefill_tokens'])
    sum_prefill_tokensmnumprefills = trial.suggest_float('sum_prefill_tokensmnumprefills', *pbounds['sum_prefill_tokensmnumprefills'])
    max_prefill_tokenss2 = trial.suggest_float('max_prefill_tokenss2', *pbounds['max_prefill_tokenss2'])
    max_prefill_tokensmnumprefills = trial.suggest_float('max_prefill_tokensmnumprefills', *pbounds['max_prefill_tokensmnumprefills'])
    num_prefillss2 = trial.suggest_float('num_prefillss2', *pbounds['num_prefillss2'])
    intercept = trial.suggest_float('intercept', *pbounds['intercept'])
    schedule_time = trial.suggest_float('schedule_time', *pbounds['schedule_time'])
    update_time = trial.suggest_float('update_time', *pbounds['update_time'])
    queue_overhead = trial.suggest_float('queue_overhead', *pbounds['queue_overhead'])
    vllm_overhead = trial.suggest_float('vllm_overhead', *pbounds['vllm_overhead'])
    return black_box_function(sum_decode_tokens, sum_prefill_tokens, max_prefill_tokens, num_prefills, sum_decode_tokenss2, sum_decode_tokensmsumprefill_tokens, sum_decode_tokensmmaxprefill_tokens, sum_decode_tokensmnumprefills,sum_prefill_tokenss2, sum_prefill_tokensmmaxprefill_tokens, sum_prefill_tokensmnumprefills, max_prefill_tokenss2, max_prefill_tokensmnumprefills, num_prefillss2, intercept, schedule_time, update_time, queue_overhead, vllm_overhead)

In [None]:

# from bayes_opt import BayesianOptimization
import optuna
from optuna.samplers import TPESampler

sampler = TPESampler(seed=10)
study = optuna.create_study(sampler=sampler)

# for warm start with baseline values
study.enqueue_trial(baseline_values)

study.optimize(objective, n_trials=200)

print (study.best_params)
# optimizer = BayesianOptimization(
#     f=black_box_function,
#     pbounds=pbounds,
#     random_state=1,
# )

# optimizer.maximize(
#     init_points=20,
#     n_iter=100,
# )


[I 2025-07-14 13:56:33,421] A new study created in memory with name: no-name-a418c812-107e-48ec-8e7d-3b745b83c37b


[I 2025-07-14 13:56:34,809] Trial 0 finished with value: 0.46004675764612224 and parameters: {'sum_decode_tokens': 3.8417561744097597e-05, 'sum_prefill_tokens': 7.469529572290514e-06, 'max_prefill_tokens': -2.904389955972766e-06, 'num_prefills': 0.0016898860868643296, 'sum_decode_tokenss2': 4.238567032064265e-08, 'sum_decode_tokensmsumprefill_tokens': -1.2160670817368414e-07, 'sum_decode_tokensmmaxprefill_tokens': 1.6373114139824623e-07, 'sum_decode_tokensmnumprefills': 2.9435592898551796e-05, 'sum_prefill_tokenss2': -8.999705255925466e-09, 'sum_prefill_tokensmmaxprefill_tokens': 2.1209027522030217e-08, 'sum_prefill_tokensmnumprefills': 7.870609225752643e-06, 'max_prefill_tokenss2': -8.266947946240942e-09, 'max_prefill_tokensmnumprefills': -1.3136059569422284e-05, 'num_prefillss2': -0.0009142213279574984, 'intercept': 0.006600941799684202, 'schedule_time': 574.6070901775936, 'update_time': 88.87021269727198, 'queue_overhead': 895.9380340853165, 'vllm_overhead': 7253.32236753883}. Best 

{'sum_decode_tokens': 2.6345552228978724e-05, 'sum_prefill_tokens': 8.646580073144613e-06, 'max_prefill_tokens': -3.697759554598298e-06, 'num_prefills': 0.0017764070670311667, 'sum_decode_tokenss2': 3.841326107894795e-08, 'sum_decode_tokensmsumprefill_tokens': -1.2880883836898533e-07, 'sum_decode_tokensmmaxprefill_tokens': 1.8594719672479232e-07, 'sum_decode_tokensmnumprefills': 3.040095150231441e-05, 'sum_prefill_tokenss2': -7.723753755349376e-09, 'sum_prefill_tokensmmaxprefill_tokens': 2.52185947798057e-08, 'sum_prefill_tokensmnumprefills': 6.40225826093979e-06, 'max_prefill_tokenss2': -9.471605598732113e-09, 'max_prefill_tokensmnumprefills': -1.0081548100282114e-05, 'num_prefillss2': -0.0006899261322254075, 'intercept': 0.0059448950757889476, 'schedule_time': 553.5511306108177, 'update_time': 71.11874589329447, 'queue_overhead': 867.5172418867629, 'vllm_overhead': 7423.1023167412095}


In [37]:
# targets = []
# for res in optimizer.res:
#     target = res['target']
#     targets.append(float(target))

# plt.plot(targets)

# plt.xlabel("Optimizer Steps")
# plt.ylabel("MSE loss")
# plt.title("Optimizer loss over steps")

# plt.show()

optuna.visualization.plot_optimization_history(study)

In [38]:
# Get the best parameters
# best_params = optimizer.max['params']
# best_value = optimizer.max['target']

print("Best parameters found:")
for param in study.best_params:
    print(f"  {param} = {study.best_params[param]},")

print(f"\nBest objective value: {study.best_value}")


Best parameters found:
  sum_decode_tokens = 2.6345552228978724e-05,
  sum_prefill_tokens = 8.646580073144613e-06,
  max_prefill_tokens = -3.697759554598298e-06,
  num_prefills = 0.0017764070670311667,
  sum_decode_tokenss2 = 3.841326107894795e-08,
  sum_decode_tokensmsumprefill_tokens = -1.2880883836898533e-07,
  sum_decode_tokensmmaxprefill_tokens = 1.8594719672479232e-07,
  sum_decode_tokensmnumprefills = 3.040095150231441e-05,
  sum_prefill_tokenss2 = -7.723753755349376e-09,
  sum_prefill_tokensmmaxprefill_tokens = 2.52185947798057e-08,
  sum_prefill_tokensmnumprefills = 6.40225826093979e-06,
  max_prefill_tokenss2 = -9.471605598732113e-09,
  max_prefill_tokensmnumprefills = -1.0081548100282114e-05,
  num_prefillss2 = -0.0006899261322254075,
  intercept = 0.0059448950757889476,
  schedule_time = 553.5511306108177,
  update_time = 71.11874589329447,
  queue_overhead = 867.5172418867629,
  vllm_overhead = 7423.1023167412095,

Best objective value: 0.2890180716271883


In [24]:
# Test on baseline parameters
black_box_function(**baseline_values)


np.float64(14.593905009127141)

In [48]:
# Test on regression best parameters
black_box_function(sum_decode_tokens=4.29457908e-05,
    sum_prefill_tokens=1.32742879e-05,
    max_prefill_tokens=-5.39382272e-06,
    num_prefills=1.64539882e-04,
    sum_decode_tokenss2=4.55100279e-08,
    sum_decode_tokensmsumprefill_tokens=1.67622486e-08,
    sum_decode_tokensmmaxprefill_tokens=1.20466352e-07,
    sum_decode_tokensmnumprefills=2.47965016e-06,
    sum_prefill_tokenss2=2.60209494e-09,
    sum_prefill_tokensmmaxprefill_tokens=-1.00133429e-08,
    sum_prefill_tokensmnumprefills=-3.26463654e-07,
    max_prefill_tokenss2=7.73080154e-09,
    max_prefill_tokensmnumprefills=-3.70684738e-07,
    num_prefillss2=-8.03421694e-06,
    intercept=0.0056702534775332145,
    schedule_time=150,
    update_time=80,
    queue_overhead=1500,
    vllm_overhead=6000)

np.float64(11.042294365945464)

In [41]:
black_box_function(**study.best_params)

np.float64(17.009667382522622)

In [28]:
import os

vllm_dir = "./blackbox"
sim_dir = "./results/sweep_params"
sweep_configs = {
    'num_prompts': [400],
    'request_rate': [32],
    'temperature': [0.0],
    'max_num_batched_tokens': [256, 512, 1024, 2048, 4096, 8192],
    'long_prefill_token_threshold': [16, 32, 64, 128, 256, 512, 1024],
    'datasets': [{'name': 'sharegpt', 'path': 'ShareGPT_V3_unfiltered_cleaned_split.json'}]
}


# graphing parameters
x_axis = 'long_prefill_token_threshold'
y_axis = 'max_num_batched_tokens'
metrics = ['ttft_accuracy', 'tpot_accuracy', 'e2e_accuracy', 'duration_accuracy']
plotting_data = {}
for metric in metrics:
    plotting_data[metric] = {}

# Find matching experiments
matching_pairs = find_matching_experiments(sim_dir, vllm_dir, sweep_configs)

print(f"Found {len(matching_pairs)} matching experiment pairs")

# Process each pair
for sim_file, vllm_file, config in matching_pairs:
    print(f"\nProcessing: {os.path.basename(sim_file)}")
    
    # Parse results
    sim_results = parse_sim_results(sim_file)
    vllm_results = parse_vllm_results(vllm_file)
    
    # Calculate errors
    errors = calculate_errors(sim_results, vllm_results)
    # Print results
    for metric, value in errors.items():
        print(f"{metric}: {value}")
        

# plot_results(plotting_data, x_axis, y_axis, sweep_configs)

Found 39 matching experiment pairs

Processing: exp_400p_32r_0.0t_256mbt_16lpt_sharegpt.txt
ttft_mse: 0.006707970915306442
ttft_accuracy: 68.23163080971142
tpot_mse: 5.26089310338664e-05
tpot_accuracy: 84.99729067216111
e2e_mse: 0.2015038035228766
e2e_accuracy: 80.7869021021387
duration_accuracy: 93.36633663366337

Processing: exp_400p_32r_0.0t_256mbt_32lpt_sharegpt.txt
ttft_mse: 0.002937600541552581
ttft_accuracy: 68.11654119430756
tpot_mse: 3.783332868577224e-05
tpot_accuracy: 86.99288441484642
e2e_mse: 0.24164349124574866
e2e_accuracy: 77.7687918722904
duration_accuracy: 95.25882352941176

Processing: exp_400p_32r_0.0t_256mbt_64lpt_sharegpt.txt
ttft_mse: 0.0027068898401245
ttft_accuracy: 54.34931547119383
tpot_mse: 2.9842517139185325e-05
tpot_accuracy: 77.30678987179606
e2e_mse: 0.5537600806603118
e2e_accuracy: 63.851887353591735
duration_accuracy: 96.9503546099291

Processing: exp_400p_32r_0.0t_256mbt_128lpt_sharegpt.txt
ttft_mse: 0.002247400853287913
ttft_accuracy: 43.630782653276

In [12]:
ttft_mse: 0.025188282731345088
ttft_accuracy: 48.2461751138668
tpot_mse: 3.695219311984646e-05
tpot_accuracy: 59.17752753509921
e2e_mse: 1.9071773913358943
e2e_accuracy: 54.04913470013282
duration_accuracy: 94.59522422830518