In [None]:
import pandas as pd
import numpy as np

from classifier.file_reader import read_files_from_folder

from pathlib import Path

# --- Paths -------------------------------------------------------------------
# The parent of a bare filename is ".", so everything below is resolved against
# the current working directory of the kernel.
PROJECT_ROOT_PATH = Path("dataset_merger.ipynb").parent
BASE_DATASET_PATH = (PROJECT_ROOT_PATH / "data").absolute()

# --- Benchmarks to merge -----------------------------------------------------
# Each name is used as a sub-folder of the per-model inference outputs.
BENCHMARKS = [
    "arc_challenge",
    "arc_easy",
    "boolq",
    "logiqa",
    "piqa",
    "sciq",
    "social_iqa",
    "winogrande",
]

# --- Column selection --------------------------------------------------------
# Model-size fragments; a column is kept when its name contains one of them.
LLAMA3_COLS = ["_small", "_medium"]
QWEN2_COLS = ["_xsmall", "_large"]

# Destination root for the merged per-benchmark CSV files.
OUTPUT_FOLDER = BASE_DATASET_PATH / "llama3_qwen2_mix" / "inference_outputs"


In [None]:
# For every benchmark: load the Llama 3 and Qwen 2 inference outputs, keep the
# shared base columns plus the columns of the configured model sizes, join the
# two subsets on their index, and write the result to
# OUTPUT_FOLDER/<benchmark>/all_data.csv.
#
# NOTE(review): read_files_from_folder is assumed to return a pandas DataFrame
# (it is used with .columns, column-list indexing and .join below), and both
# frames are assumed to share a comparable index — confirm against the helper.
for benchmark in BENCHMARKS:
    llama3_set = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/llama3/inference_outputs/{benchmark}")
    qwen2_set = read_files_from_folder(f"{PROJECT_ROOT_PATH}/data/qwen2/inference_outputs/{benchmark}")

    # LLAMA3_COLS / QWEN2_COLS are taken from the configuration cell; they are
    # intentionally not redefined here so that edits made there take effect.
    base_cols = ["input_text", "benchmark_name"]

    # Get relevant llama columns: the base columns plus every column whose name
    # contains one of the configured size fragments. `any` guarantees a column
    # is selected once even if it matches more than one fragment.
    llama3_set_cols = llama3_set.columns.to_list()
    relevant_llama_cols = base_cols + [
        col_name
        for col_name in llama3_set_cols
        if any(model_size in col_name for model_size in LLAMA3_COLS)
    ]

    # Get relevant qwen columns (base columns already come from the llama side).
    qwen2_set_cols = qwen2_set.columns.to_list()
    relevant_qwen_cols = [
        col_name
        for col_name in qwen2_set_cols
        if any(model_size in col_name for model_size in QWEN2_COLS)
    ]

    display(llama3_set_cols)
    display(qwen2_set_cols)

    print(f"Relevant llama cols: {relevant_llama_cols}")
    print(f"Relevant qwen cols: {relevant_qwen_cols}")

    # Non-inplace sort: sorting a column subset in place can emit
    # SettingWithCopyWarning and makes the cell harder to re-run safely.
    subset_llama = llama3_set[relevant_llama_cols].sort_index()
    subset_qwen = qwen2_set[relevant_qwen_cols].sort_index()

    # Index-aligned inner join: only rows present in both sets survive.
    cs_set = subset_llama.join(subset_qwen, how='inner').sort_index()

    save_path = Path(f"{OUTPUT_FOLDER}/{benchmark}")
    save_path.mkdir(parents=True, exist_ok=True)
    cs_set.to_csv(save_path / "all_data.csv")