In [29]:
import pandas as pd
import numpy as np

from classifier.file_reader import read_files_from_folder

from pathlib import Path


# Project root: the directory component of the notebook's relative path (i.e. ".").
PROJECT_ROOT_PATH = Path("dataset_merger.ipynb").parent
# Absolute path of the data folder, resolved against the current working directory.
BASE_DATASET_PATH = (PROJECT_ROOT_PATH / "data").absolute()

# Benchmark names; each one is used as a sub-folder name under the inference outputs.
BENCHMARKS = [
	"arc_challenge", "arc_easy",
	"boolq", "logiqa",
	"piqa", "sciq",
	"social_iqa", "winogrande",
]

# Columns that are not specific to any single model.
BASE_COLS = ["input_text", "benchmark_name"]

# Model-size tags, one list per model family.
LLAMA3_COLS = ["small", "medium"]
QWEN2_COLS = ["xsmall", "large"]



In [30]:
# NOTE: this whole cell is disabled (commented out) and kept for reference only.
# If re-enabled it would, for each benchmark:
#   1. read the llama3 and qwen2 inference outputs,
#   2. keep the BASE_COLS (names found in the llama3 frame, values taken from the qwen2 frame)
#      plus the qwen2 columns whose name contains a QWEN2_COLS entry,
#   3. keep the llama3 columns whose name contains a LLAMA3_COLS entry,
#   4. inner-join both selections on their index,
#   5. write the result to <BASE_DATASET_PATH>/llama3_qwen2_mix/inference_outputs/<benchmark>/all_data.csv.
# NOTE(review): column selection is a substring test, so "small" also matches any "xsmall"
# column present in the llama3 frame, and a column matching several entries is appended
# more than once - confirm this is intended before re-enabling.
#
# for benchmark in BENCHMARKS:
# 	bm_subset1 = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/llama3/inference_outputs/{benchmark}")
# 	bm_subset2 = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/qwen2/inference_outputs/{benchmark}")
#
# 	cols_a = bm_subset1.columns.to_list()
# 	cols_b = bm_subset2.columns.to_list()
#
# 	base_cols = [col for col in cols_a if col in BASE_COLS]
# 	cols_a_adj = []
# 	for col in cols_a:
# 		for selected_col in LLAMA3_COLS:
# 			if selected_col in col:
# 				cols_a_adj.append(col)
#
# 	cols_b_adj = []
# 	for col in cols_b:
# 		for selected_col in QWEN2_COLS:
# 			if selected_col in col:
# 				cols_b_adj.append(col)
#
# 	cols_b_adj = base_cols + cols_b_adj
#
# 	merged_df = bm_subset2[cols_b_adj].merge(
# 		bm_subset1[cols_a_adj],
# 		left_index=True,
# 		right_index=True,
# 		how="inner"
# 	)
#
# 	benchmark_base_path = Path(f"{BASE_DATASET_PATH}/llama3_qwen2_mix/inference_outputs/{benchmark}")
# 	benchmark_base_path.mkdir(parents=True, exist_ok=True)
# 	merged_df.to_csv(f"{benchmark_base_path}/all_data.csv")


In [31]:
# df_test = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/llama3_qwen2_mix/inference_outputs/arc_challenge")
# display(df_test)

In [36]:
# Establishing different cost ratios and spreads.
#
# For every benchmark the mean energy consumption of each model is expressed as a
# ratio to the mean of those per-model means (the "cost scale"). Each energy column
# is then multiplied by ratio ** (POWER_SCALING_FACTOR - 1), which moves its mean from
# global_mean * ratio to global_mean * ratio ** POWER_SCALING_FACTOR. The rescaled
# frame is written below OUTPUT_FOLDER and the ratios measured before ("original")
# and after ("scaled") the rescaling are collected in stats_df.

POWER_SCALING_FACTOR = 0.3
OUTPUT_FOLDER = BASE_DATASET_PATH / "qwen2_narrow_cost_spread" / "inference_outputs"
LABELS = ["xsmall", "small", "medium", "large"]
STATS_COLUMNS = ["benchmark", "model", "observation_point", "cost_scale"]


def _cost_ratios(frame, labels):
	"""Return {label: mean of energy_consumption_<label> / mean of all those per-label means}."""
	means = {label: frame[f"energy_consumption_{label}"].mean() for label in labels}
	global_mean = np.mean(list(means.values()))
	return {label: means[label] / global_mean for label in labels}


# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop copies the whole frame each time.
stats_records = []
for benchmark in BENCHMARKS:

	print(f"=======    {benchmark}    =======")
	cs_set = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/qwen2/inference_outputs/{benchmark}")

	original_ratios = _cost_ratios(cs_set, LABELS)

	# A zero, negative or NaN ratio raised to a negative fractional power yields inf/NaN,
	# which would silently corrupt the energy columns written to disk - fail loudly instead.
	invalid = {label: ratio for label, ratio in original_ratios.items() if not (np.isfinite(ratio) and ratio > 0)}
	if invalid:
		raise ValueError(f"Cannot rescale energy consumption for benchmark '{benchmark}': non-positive or non-finite cost ratios {invalid}")

	for label in LABELS:
		column = f"energy_consumption_{label}"
		cs_set[column] = cs_set[column] * (original_ratios[label] ** (POWER_SCALING_FACTOR - 1))

	# Ratios are re-measured on the rescaled columns, i.e. relative to the new global mean.
	scaled_ratios = _cost_ratios(cs_set, LABELS)

	# Same row order as before: all "original" rows of a benchmark, then all "scaled" rows.
	for observation_point, ratios in (("original", original_ratios), ("scaled", scaled_ratios)):
		stats_records.extend(
			{
				"benchmark": benchmark,
				"model": label,
				"observation_point": observation_point,
				"cost_scale": ratios[label],
			}
			for label in LABELS
		)

	save_path = OUTPUT_FOLDER / benchmark
	save_path.mkdir(parents=True, exist_ok=True)
	cs_set = cs_set.sort_index()
	cs_set.to_csv(save_path / "all_data.csv")

# Passing columns= keeps the expected schema even when BENCHMARKS is empty.
stats_df = pd.DataFrame(stats_records, columns=STATS_COLUMNS)
display(stats_df)



Unnamed: 0,benchmark,model,observation_point,cost_scale
0,arc_challenge,xsmall,original,0.215933
1,arc_challenge,small,original,0.287896
2,arc_challenge,medium,original,0.654523
3,arc_challenge,large,original,2.841648
4,arc_challenge,xsmall,scaled,0.707788
...,...,...,...,...
59,winogrande,large,original,2.647747
60,winogrande,xsmall,scaled,0.701364
61,winogrande,small,scaled,0.758810
62,winogrande,medium,scaled,1.055860
