In [8]:
import time

import numpy as np
import wandb
import yaml

from pathlib import Path

from classifier.file_reader import read_files_from_folder
from evaluations.utils.wandb_loader import download_log_data, load_all_histories_to_dataframe

# Model family whose config files and inference outputs are read by the random-baseline cell below.
MODEL_FAMILY = "qwen2"
# Benchmarks replayed by the random-baseline cell; commented-out entries are skipped.
# NOTE(review): later cells reassign BENCHMARKS from downloaded run names, so this list
# only applies if the cells are executed top to bottom.
BENCHMARKS = [
	"arc_challenge",
	"arc_easy",
	"boolq",
	# "lambada_standard",
	"logiqa",
	# "logiqa2",
	"piqa",
	"sciq",
	"social_iqa",
	"winogrande"
]
# Whether to use a pre-trained classifier ("pretrained") or to learn the classifier online while benchmarking ("online").
APPROACH = "online"  # alt: "pretrained"
# This only has an effect when APPROACH = "pretrained". Make sure to adjust the minibatch size accordingly!
# The evaluation loops in this notebook reset it to 0 before running.
NUM_PRETRAINING_STEPS = 400
# NOTE(review): SEEDS and NUM_CLASSIFIER_LABELS are not referenced by the cells visible here —
# presumably consumed elsewhere or left over; confirm before relying on them.
SEEDS = [42]
NUM_CLASSIFIER_LABELS = 3

# `.parent` of a single-component relative path is ".", so all project paths below are
# resolved relative to the current working directory.
# NOTE(review): presumably the notebook must be launched from the project root — verify.
PROJECT_ROOT_PATH = Path("mess_plus_simulator").parent

In [9]:
def choose_llm_from_zoo(available_models: list, probs: np.ndarray, rng=None):
	"""
	Draw one model from ``available_models`` according to ``probs``.

	Args:
		available_models: Candidate models to sample from.
		probs: Selection probability for each entry of ``available_models``
			(same length, must sum to 1).
		rng: Optional sampler exposing ``choice(a, p=...)``, e.g. a
			``numpy.random.Generator`` created with a fixed seed. When omitted,
			the global NumPy random state is used, exactly as before.

	Returns:
		The sampled element of ``available_models``.
	"""
	# Fall back to the legacy global RNG so existing callers behave unchanged.
	sampler = np.random if rng is None else rng
	return sampler.choice(available_models, p=probs)

In [10]:
def calculate_probabilities(model_accuracies, alpha, max_iterations=5000, tolerance=1e-6):
    """
    Compute a priori selection probabilities for a baseline model selection process.

    Starting from the uniform distribution, probability mass is shifted
    multiplicatively towards models whose accuracy exceeds the current expected
    accuracy until ``np.dot(p, accuracies) >= alpha - tolerance``. If that does
    not happen within ``max_iterations`` updates, a closed-form blend of the
    uniform distribution and the best model is returned instead, which also
    satisfies the constraint.

    Args:
        model_accuracies: 1-D sequence with one accuracy per model.
        alpha: Target expected accuracy of the random selection.
        max_iterations: Maximum number of multiplicative update steps.
        tolerance: Slack allowed below ``alpha`` when testing the constraint.

    Returns:
        np.ndarray with one probability per model, summing to 1.

    Raises:
        ValueError: If ``model_accuracies`` is empty or ``alpha`` exceeds the
            best model's accuracy (the constraint would be infeasible).
    """
    accuracies = np.asarray(model_accuracies, dtype=float)
    n = accuracies.size

    if n == 0:
        raise ValueError("model_accuracies must not be empty")

    # No mixture of models can beat the single best model.
    best_acc = accuracies.max()
    if alpha > best_acc:
        raise ValueError("Alpha too high")

    uniform = np.ones(n) / n
    p = uniform.copy()

    # Method 1: iterative multiplicative re-weighting.
    increase_factor = 1.01  # applied to models above the current expected accuracy
    decrease_factor = 0.99  # applied to all other models
    for _ in range(max_iterations):
        current_acc = np.dot(p, accuracies)

        if current_acc >= alpha - tolerance:
            return p

        p = p * np.where(accuracies > current_acc, increase_factor, decrease_factor)
        p = p / np.sum(p)

    # Method 2: closed-form fallback. Blend the uniform distribution with a
    # one-hot on the best model, using just enough weight to reach alpha.
    uniform_acc = np.dot(uniform, accuracies)
    if uniform_acc >= alpha - tolerance:
        return uniform

    # Here uniform_acc < alpha <= best_acc, so the denominator is strictly positive.
    mix = min(1.0, (alpha - uniform_acc) / (best_acc - uniform_acc))
    p = (1.0 - mix) * uniform
    p[int(np.argmax(accuracies))] += mix

    # Final normalization guards against rounding drift.
    return p / np.sum(p)

In [None]:
# Random baseline with accuracy constraint: for every benchmark and every alpha,
# sample one model per request from fixed a priori probabilities and log the
# running accuracy, model-choice shares, energy and virtual queue length to W&B.
for BENCHMARK_NAME in BENCHMARKS:

	# Load benchmark config
	config_path = Path(f"{PROJECT_ROOT_PATH}/config/{MODEL_FAMILY}/online/{BENCHMARK_NAME}.yaml")
	NUM_PRETRAINING_STEPS = 0

	with config_path.open("r") as f:
		CONFIG = yaml.safe_load(f)
		display(CONFIG)

	algorithm_config = CONFIG["algorithm"]

	# Load data
	try:
		input_df = read_files_from_folder(folder_path=f"{PROJECT_ROOT_PATH}/data/{MODEL_FAMILY}/inference_outputs/{BENCHMARK_NAME}")
	except ValueError as err:
		# Say why a benchmark is missing from the results instead of dropping it silently.
		print(f"Skipping {BENCHMARK_NAME}: {err}")
		continue

	input_df["idx_original"] = input_df.index
	# NOTE(review): the shuffle is unseeded, so request order differs between runs.
	input_df = input_df.sample(frac=1).reset_index(drop=True)
	available_models = [col for col in input_df.columns if "label" in col]

	# Get inputs for choice probabilities: mean label per model column.
	model_performance = {model: input_df[model].mean() for model in available_models}
	model_keys = list(model_performance.keys())
	model_accuracies = list(model_performance.values())

	for alpha in algorithm_config["alpha_values"]:
		probs = calculate_probabilities(model_accuracies, alpha)

		run = wandb.init(
			project="mess_plus_qwen2_random_baseline_with_constraint_v01",
			name=f"{BENCHMARK_NAME}_alpha={alpha}",
			config=CONFIG
		)

		MODEL_CHOICES = []
		ACCURACY_LIST = []
		Q = 0.0
		# Running aggregates, so each step costs O(1) instead of re-scanning the history.
		accuracy_total = 0
		choice_counts = {}

		try:
			for idx, row in input_df.iterrows():
				model_choice = choose_llm_from_zoo(model_keys, probs)
				# Column names end in the model size, e.g. "..._large".
				model_name = model_choice.split("_")[-1]
				choice_id = model_keys.index(model_choice)
				label = row[model_choice]

				MODEL_CHOICES.append(choice_id)
				ACCURACY_LIST.append(label)
				accuracy_total += label
				choice_counts[choice_id] = choice_counts.get(choice_id, 0) + 1
				num_seen = len(ACCURACY_LIST)

				# Virtual queue: grows when the sampled model misses the accuracy target.
				Q = max(0.0, Q + alpha - label)

				statistics_dict = {
					"model_choice": choice_id,
					"avg_accuracy": accuracy_total / num_seen,
					"models/xsmall_chosen": choice_counts.get(0, 0) / num_seen,
					"models/small_chosen": choice_counts.get(1, 0) / num_seen,
					"models/medium_chosen": choice_counts.get(2, 0) / num_seen,
					"models/large_chosen": choice_counts.get(3, 0) / num_seen,
					"mess_plus/energy": row[f"energy_consumption_{model_name}"],
					"mess_plus/q_length": Q,
				}

				if wandb.run is not None:
					wandb.log(statistics_dict, step=idx)
		finally:
			# Close the run even if a row fails, so the next wandb.init starts clean.
			wandb.finish()

In [12]:
# Comparison with RouteLLM: download the sweep's run logs into DATA_DIR,
# then load every downloaded history into a single dataframe.
DATA_DIR = str(PROJECT_ROOT_PATH) + "/data/routellm_raw"

# NOTE(review): the return value of download_log_data is not used by the cells visible here.
routellm_logs = download_log_data(entity="tum-i13", project_name="routellm-sweepv2", save_dir=DATA_DIR, batch_size=50)
routellm_df = load_all_histories_to_dataframe(DATA_DIR)


In [13]:
# Replay RouteLLM routing decisions against the raw per-model evaluation results
# and log running accuracy, model shares, energy and virtual queue length to W&B.
# Run names are split on "-" into router model, benchmark, a "thr" token and the threshold.
routellm_df[["router_model", "benchmark", "thr", "threshold"]] = routellm_df["run_name"].str.split("-", expand=True)
BENCHMARKS = routellm_df["benchmark"].unique().tolist()

print(BENCHMARKS)

# Maps the router's logged model id to the column suffix in the raw evaluation data.
MODEL_CHOICE_DICT = {
    1: "large",
    0: "small",
}

for benchmark in BENCHMARKS:
    # Load benchmark config
    # NOTE(review): unlike the random-baseline cell, these paths omit MODEL_FAMILY — confirm the layout.
    config_path = Path(f"{PROJECT_ROOT_PATH}/config/online/{benchmark}.yaml")
    NUM_PRETRAINING_STEPS = 0

    with config_path.open("r") as f:
        CONFIG = yaml.safe_load(f)

    algorithm_config = CONFIG["algorithm"]

    try:
        benchmark_raw_df = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/inference_outputs/{benchmark}")
    except ValueError as err:
        # Say why a benchmark is missing from the results instead of dropping it silently.
        print(f"Skipping {benchmark}: {err}")
        continue

    # Filter once per benchmark; the alpha/threshold loops below only narrow this further.
    benchmark_runs = routellm_df.loc[routellm_df["benchmark"] == benchmark]
    THRESHOLDS = benchmark_runs["threshold"].unique().tolist()

    display(f"BENCHMARK: {benchmark} - {benchmark_raw_df['label_large'].mean()}")

    for alpha in algorithm_config["alpha_values"]:
        for threshold in THRESHOLDS:
            threshold_runs = benchmark_runs.loc[benchmark_runs["threshold"] == threshold]

            run = wandb.init(
                project="routellm_baseline_v01",
                name=f"{benchmark}_alpha={alpha}_thres={threshold}",
                config=CONFIG
            )

            MODEL_CHOICES = []
            ACCURACY_LIST = []
            Q = 0.0
            # Running aggregates, so each step costs O(1) instead of re-scanning the history.
            accuracy_total = 0
            choice_counts = {}

            # Counts successfully matched rows only; also used as the W&B step.
            iterator = 0
            try:
                for idx, row in threshold_runs.iterrows():
                    model_choice_id = row["model_choice"]

                    # Get accuracy from raw evaluations
                    chosen_model = MODEL_CHOICE_DICT[model_choice_id]
                    row_benchmark_data = benchmark_raw_df.loc[(benchmark_raw_df.index == row["document_id"])]

                    result_data = row_benchmark_data[f"label_{chosen_model}"].values
                    if len(result_data) == 0:
                        # The routed document has no matching raw evaluation row.
                        print(f"Error encountered for {benchmark}: #{iterator}")
                        continue
                    result = result_data[0]

                    MODEL_CHOICES.append(model_choice_id)
                    ACCURACY_LIST.append(result)
                    accuracy_total += result
                    choice_counts[model_choice_id] = choice_counts.get(model_choice_id, 0) + 1
                    num_seen = iterator + 1

                    # Virtual queue: grows when the routed model misses the accuracy target.
                    Q = max(0.0, Q + alpha - result)

                    inference_energy = row_benchmark_data[f"energy_consumption_{chosen_model}"].values[0]
                    total_energy = inference_energy + row["energy"]

                    statistics_dict = {
                        "model_choice": model_choice_id,
                        "avg_accuracy": accuracy_total / num_seen,
                        "models/small_chosen": choice_counts.get(0, 0) / num_seen,
                        "models/large_chosen": choice_counts.get(1, 0) / num_seen,
                        "mess_plus/energy": total_energy,
                        "routellm/router_energy": row["energy"],
                        "routellm/inference_energy": inference_energy,
                        "mess_plus/q_length": Q,
                    }

                    if wandb.run is not None:
                        wandb.log(statistics_dict, step=iterator)

                    iterator += 1
            finally:
                # Close the run even if a row fails, so the next wandb.init starts clean.
                if wandb.run is not None:
                    wandb.finish()

            # Short pause between consecutive W&B runs.
            time.sleep(3)


In [14]:
# Comparison with RouterDC: download the sweep's run logs into DATA_DIR,
# then load every downloaded history into a single dataframe.
DATA_DIR = str(PROJECT_ROOT_PATH) + "/data/routerdc_raw"

# The download's return value is immediately replaced by the loaded histories below.
routerdc_logs = download_log_data(entity="tum-i13", project_name="routerdc-sweep", save_dir=DATA_DIR, batch_size=50)
routerdc_logs = load_all_histories_to_dataframe(DATA_DIR)

In [15]:
# Replay RouterDC routing decisions against the raw per-model evaluation results
# and log running accuracy, model shares, energy and virtual queue length to W&B.
# The benchmark name is the run name with the "routerdc-" prefix removed.
routerdc_logs["benchmark"] = routerdc_logs["run_name"].str.replace("routerdc-", "")
BENCHMARKS = routerdc_logs["benchmark"].unique().tolist()

print(BENCHMARKS)

# Maps the router's logged model id to the column suffix in the raw evaluation data.
MODEL_CHOICE_DICT = {
    2: "large",
    1: "medium",
    0: "small",
}

for benchmark in BENCHMARKS:
    # Load benchmark config
    # NOTE(review): unlike the random-baseline cell, these paths omit MODEL_FAMILY — confirm the layout.
    config_path = Path(f"{PROJECT_ROOT_PATH}/config/online/{benchmark}.yaml")
    NUM_PRETRAINING_STEPS = 0

    with config_path.open("r") as f:
        CONFIG = yaml.safe_load(f)

    algorithm_config = CONFIG["algorithm"]

    try:
        benchmark_raw_df = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/inference_outputs/{benchmark}")
    except ValueError as err:
        # Say why a benchmark is missing from the results instead of dropping it silently.
        print(f"Skipping {benchmark}: {err}")
        continue

    display(f"BENCHMARK: {benchmark} - {benchmark_raw_df['label_large'].mean()}")

    # Filter once per benchmark; every alpha replays the same routing decisions.
    benchmark_runs = routerdc_logs.loc[(routerdc_logs["benchmark"] == benchmark)]

    for alpha in algorithm_config["alpha_values"]:

        run = wandb.init(
            project="routerdc_baseline_v01",
            name=f"{benchmark}-alpha={alpha}",
            config=CONFIG
        )

        MODEL_CHOICES = []
        ACCURACY_LIST = []
        Q = 0.0
        # Running aggregates, so each step costs O(1) instead of re-scanning the history.
        accuracy_total = 0
        choice_counts = {}

        # Counts successfully matched rows only; also used as the W&B step.
        iterator = 0
        try:
            for idx, row in benchmark_runs.iterrows():
                model_choice_id = row["model_choice"]

                # Get accuracy from raw evaluations
                chosen_model = MODEL_CHOICE_DICT[model_choice_id]
                row_benchmark_data = benchmark_raw_df.loc[(benchmark_raw_df.index == row["document_id"])]

                result_data = row_benchmark_data[f"label_{chosen_model}"].values
                if len(result_data) == 0:
                    # The routed document has no matching raw evaluation row.
                    print(f"Error encountered for {benchmark}: #{iterator}")
                    continue
                result = result_data[0]

                MODEL_CHOICES.append(model_choice_id)
                ACCURACY_LIST.append(result)
                accuracy_total += result
                choice_counts[model_choice_id] = choice_counts.get(model_choice_id, 0) + 1
                num_seen = iterator + 1

                # Virtual queue: grows when the routed model misses the accuracy target.
                Q = max(0.0, Q + alpha - result)

                inference_energy = row_benchmark_data[f"energy_consumption_{chosen_model}"].values[0]
                total_energy = inference_energy + row["energy"]

                # NOTE(review): the "routellm/" prefixes look like a leftover from the RouteLLM cell;
                # kept unchanged so previously logged metric names stay comparable.
                statistics_dict = {
                    "model_choice": model_choice_id,
                    "avg_accuracy": accuracy_total / num_seen,
                    "models/small_chosen": choice_counts.get(0, 0) / num_seen,
                    "models/medium_chosen": choice_counts.get(1, 0) / num_seen,
                    "models/large_chosen": choice_counts.get(2, 0) / num_seen,
                    "mess_plus/energy": total_energy,
                    "routellm/router_energy": row["energy"],
                    "routellm/inference_energy": inference_energy,
                    "mess_plus/q_length": Q,
                }

                if wandb.run is not None:
                    wandb.log(statistics_dict, step=iterator)

                iterator += 1
        finally:
            # Close the run even if a row fails, so the next wandb.init starts clean.
            if wandb.run is not None:
                wandb.finish()

        # Short pause between consecutive W&B runs.
        time.sleep(3)
