In [2]:
import json
import numpy as np
import os

def print_tex_result(results_file, primary_metrics, percent_metrics):
    """Load a metrics JSON file and pick out one primary metric per task.

    Despite the name, nothing is printed here; callers format the returned
    values into LaTeX rows themselves.

    Args:
        results_file: Path to a JSON file that is indexed as
            ``results[task][metric_name]``.
        primary_metrics: Mapping of task name -> name of the metric to report
            for that task.
        percent_metrics: Collection of metric names whose values are
            multiplied by 100 before being returned.

    Returns:
        A dict with one entry per task in ``primary_metrics`` (same order).
        The value is the (possibly scaled) metric, or ``np.nan`` when the
        task or metric is absent, or when the stored value is not a number
        (for example JSON ``null``).

    Raises:
        FileNotFoundError: If ``results_file`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(results_file, 'r', encoding='utf-8') as f:
        results = json.load(f)

    task_metrics = {}
    for task, metric_name in primary_metrics.items():
        task_results = results.get(task) if isinstance(results, dict) else None
        value = task_results.get(metric_name) if isinstance(task_results, dict) else None

        if not isinstance(value, (int, float)):
            # Missing task, missing metric, or a non-numeric value such as
            # null: use the same NaN placeholder so callers can format it.
            task_metrics[task] = np.nan
        elif metric_name in percent_metrics:
            task_metrics[task] = value * 100
        else:
            task_metrics[task] = value
    return task_metrics

In [24]:
results_dir = "evaluations_final/hetero-3B-final-468/20"
mode = "hetero"  # Change this to "none", "homo", or "hetero"
num_clients = 8  # Number of per-client result files to look for

# Task -> metric reported for that task. The insertion order of this dict is
# the column order of every LaTeX row printed below.
primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "accuracy",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "accuracy"
}
# Metrics that are multiplied by 100 before printing.
percent_metrics = {"accuracy", "f1_score", "rougeL"}
latex_order = list(primary_metrics.keys())

if mode == "homo":
    # One global model -> one row read from a single file.
    print(f"Mode: {mode}")
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    try:
        metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
    except FileNotFoundError:
        print(f"Error: Results file not found at {results_path}")
    else:
        latex_row = " & ".join([f"{metrics.get(task, np.nan):.2f}" for task in latex_order]) + " \\\\"
        print(latex_row)

elif mode == "none":
    # One row per client, followed by column-wise average and best rows.
    print(f"Mode: {mode}")
    all_rows = []
    loaded_clients = 0  # files actually read; placeholder NaN rows do not count

    # 1) load everything
    for i in range(num_clients):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        try:
            task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Skipping client.")
            # Keep a NaN row so the printed row index still equals the client id.
            all_rows.append([np.nan] * len(latex_order))
        else:
            all_rows.append([task_metrics.get(task, np.nan) for task in latex_order])
            loaded_clients += 1

    # all_rows also holds the NaN placeholder rows, so "nothing loaded" has to
    # be decided from the number of files that were actually read.
    if loaded_clients == 0:
        print("Error: No client data loaded.")
    else:
        all_array = np.array(all_rows, dtype=float)

        # Reduce only over columns that have at least one real value:
        # np.nanmax / np.nanmean emit a RuntimeWarning on all-NaN columns.
        has_data = ~np.isnan(all_array).all(axis=0)
        best_values = np.full(len(latex_order), np.nan)
        avg_values = np.full(len(latex_order), np.nan)
        if has_data.any():
            best_values[has_data] = np.nanmax(all_array[:, has_data], axis=0)
            avg_values[has_data] = np.nanmean(all_array[:, has_data], axis=0)

        # 2) print per-client rows, bolding only the column-wise maxima
        for i, row_vals in enumerate(all_rows):
            cells = []
            for val, best in zip(row_vals, best_values):
                if np.isnan(val):
                    cells.append("NaN")  # Or some other placeholder
                elif np.isclose(val, best):
                    cells.append(f"\\textbf{{{val:.2f}}}")
                else:
                    cells.append(f"{val:.2f}")
            print(f"{i} & " + " & ".join(cells) + " \\\\")

        # 3) average row
        # NOTE(review): each row printed here has 1 + len(latex_order) cells
        # (9 with the current task list); "2-10" presumes the target table has
        # one more leading column - confirm against the LaTeX source.
        print('\\cline{2-10}') # Adjust column range if needed
        avg_cells = [f"{v:.2f}" if not np.isnan(v) else "NaN" for v in avg_values]
        print("none (avg) & " + " & ".join(avg_cells) + " \\\\")

        # 4) best row (printed plain; the bold markers are in the client rows)
        best_cells = [f"{v:.2f}" if not np.isnan(v) else "NaN" for v in best_values]
        print("none (best) & " + " & ".join(best_cells) + " \\\\")

elif mode == "hetero":
    print(f"Mode: {mode}")
    hetero_metrics = {}
    valid_clients_count = 0
    # Load metrics for each client, assuming client i corresponds to task i in latex_order
    # This interpretation might need adjustment based on your exact setup.
    if len(latex_order) != num_clients:
         print(f"Warning: Number of tasks ({len(latex_order)}) does not match number of clients ({num_clients}). Diagonal logic might be incorrect.")

    # zip() stops at the shorter of the two sequences, so a client count larger
    # than the task list no longer raises IndexError after the warning above.
    # Tasks without a client stay absent and are printed as nan below.
    for i, task in zip(range(num_clients), latex_order):
        results_path = os.path.join(results_dir, f"client_{i}_hetlora_output_metrics.json")
        try:
            client_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Setting metric for task '{task}' to NaN.")
            hetero_metrics[task] = np.nan
        else:
            # Keep only this client's score on its own ("diagonal") task.
            hetero_metrics[task] = client_metrics.get(task, np.nan)
            if not np.isnan(hetero_metrics[task]):
                valid_clients_count += 1

    if valid_clients_count == 0:
        print("Error: No HetLoRA client data loaded.")
    else:
        # Format the 'diagonal' metrics into a single row, similar to 'homo' mode
        latex_row = " & ".join([f"{hetero_metrics.get(t, np.nan):.2f}" for t in latex_order]) + " \\\\"
        print(latex_row)

else:
    print(f"Error: Invalid mode '{mode}'. Choose 'none', 'homo', or 'hetero'.")

Mode: hetero
59.00 & 65.50 & 75.50 & 68.00 & 86.00 & 51.48 & 92.92 & 62.00 \\


In [None]:
import os
import numpy as np

# Two result directories to compare
results_dir1 = "evaluations_midway/none-1B-midway/20"
results_dir2 = "evaluations_midway/none-3B-midway/20"
num_clients = 8  # Number of client_{i}_output_metrics.json files per directory

# Task -> metric reported for that task; insertion order is the column order.
primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "accuracy",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "accuracy"
}
# Metrics that are multiplied by 100 before printing.
percent_metrics = {"accuracy", "f1_score", "rougeL"}
latex_order = list(primary_metrics.keys())


def _load_client_rows(results_dir):
    """Return one list of task scores (in latex_order) per client index.

    A missing result file produces a warning and a row of NaNs, so the row
    index always equals the client id.
    """
    rows = []
    for i in range(num_clients):
        path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        try:
            metrics = print_tex_result(path, primary_metrics, percent_metrics)
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {path}. Using NaN for this client.")
            rows.append([np.nan] * len(latex_order))
        else:
            rows.append([metrics[task] for task in latex_order])
    return rows


def _column_best(score_array):
    """Column-wise maximum ignoring NaN; NaN where a column has no value."""
    best = np.full(len(latex_order), np.nan)
    if score_array.size == 0:
        return best
    # Skip all-NaN columns: np.nanmax emits a RuntimeWarning on them.
    has_data = ~np.isnan(score_array).all(axis=0)
    if has_data.any():
        best[has_data] = np.nanmax(score_array[:, has_data], axis=0)
    return best


def _format_cell(value, best):
    """Format a score with two decimals, bolded when it matches `best`."""
    text = f"{value:.2f}"
    # np.isclose(nan, nan) is False, so NaN scores are never bolded.
    if np.isclose(value, best):
        return f"\\textbf{{{text}}}"
    return text


# 1) Gather all client scores for each directory
all_rows1 = _load_client_rows(results_dir1)
all_rows2 = _load_client_rows(results_dir2)

all_array1 = np.array(all_rows1, dtype=float)
all_array2 = np.array(all_rows2, dtype=float)

# 2) Compute per-column bests for each directory separately
best1 = _column_best(all_array1)
best2 = _column_best(all_array2)

# 3) Print each client row as "dir1/dir2" cells, bolding entries that match
#    the best within their own directory
for i in range(num_clients):
    cells = []
    for idx in range(len(latex_order)):
        s1 = _format_cell(all_rows1[i][idx], best1[idx])
        s2 = _format_cell(all_rows2[i][idx], best2[idx])
        cells.append(f"{s1}/{s2}")

    row_str = " & ".join(cells)
    print(f"{i} & {row_str} \\\\")


0 & \textbf{54.50}/55.50 & 34.00/35.50 & \textbf{64.50}/50.50 & 58.30/55.76 & 30.50/35.00 & 29.68/37.38 & 68.73/85.39 & 47.00/50.50 \\
1 & 53.00/56.00 & \textbf{35.00}/37.50 & \textbf{64.50}/41.50 & 68.94/67.94 & 29.00/36.50 & 29.09/36.89 & 72.77/86.59 & 49.50/50.50 \\
2 & 53.50/\textbf{57.50} & 34.50/43.00 & 64.00/\textbf{57.50} & \textbf{75.33}/\textbf{75.00} & 28.00/40.00 & 30.76/35.33 & 69.63/85.85 & 49.50/\textbf{51.00} \\
3 & 53.50/55.00 & 34.50/36.50 & \textbf{64.50}/49.50 & 23.75/47.52 & 32.50/37.00 & 28.69/35.55 & 72.76/85.72 & 47.00/\textbf{51.00} \\
4 & 51.50/53.50 & 34.50/\textbf{43.50} & \textbf{64.50}/49.50 & 60.41/70.04 & \textbf{39.00}/\textbf{45.00} & 29.68/36.08 & 68.10/84.64 & 47.00/\textbf{51.00} \\
5 & 50.00/34.50 & 28.50/0.00 & 44.00/0.00 & 14.15/0.00 & 11.00/0.50 & \textbf{33.84}/\textbf{43.04} & 72.56/85.34 & 15.00/10.00 \\
6 & 46.00/36.50 & 23.50/0.00 & 33.00/0.00 & 12.21/2.96 & 14.00/2.00 & 33.54/39.66 & \textbf{89.18}/\textbf{90.77} & 12.00/17.00 \\
7 & 54.00

In [4]:
# for appendix A

results_dir = "evaluations_midway/hetero-3B-midway/20"
is_global_model = True
num_clients = 8  # Only used when is_global_model is False

# Every task is reported with rouge1 here; insertion order is the column order.
primary_metrics = {
    "coreference": "rouge1",
    "entailment": "rouge1",
    "linguistic_acceptability": "rouge1",
    "paraphrase": "rouge1",
    "question_classification": "rouge1",
    "structure_to_text": "rouge1",
    "text_formatting": "rouge1",
    "word_disambiguation": "rouge1"
}
# Metrics that are multiplied by 100 before printing.
percent_metrics = {"accuracy", "f1_score", "rougeL", "rouge1"}
latex_order = list(primary_metrics.keys())

if is_global_model:
    # Single row: per-task scores followed by their average.
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    try:
        metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
    except FileNotFoundError:
        print(f"Error: Results file not found at {results_path}")
    else:
        scores = [metrics[task] for task in latex_order]
        # Average only the tasks that have a value, matching the nanmean used
        # for the per-client average row; one missing task would otherwise
        # turn the whole average into nan.
        valid_scores = [s for s in scores if not np.isnan(s)]
        macro_avg = sum(valid_scores) / len(valid_scores) if valid_scores else np.nan
        latex_row = " & ".join([f"{s:.2f}" for s in scores]) + " & " + f"{macro_avg:.2f}" + " \\\\"
        print(latex_row)

else:
    all_rows = []
    loaded_clients = 0  # files actually read; placeholder NaN rows do not count

    # 1) load everything
    for i in range(num_clients):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        try:
            task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Skipping client.")
            # Keep a NaN row so the printed row index still equals the client id.
            all_rows.append([np.nan] * len(latex_order))
        else:
            all_rows.append([task_metrics[task] for task in latex_order])
            loaded_clients += 1

    if loaded_clients == 0:
        print("Error: No client data loaded.")
    else:
        all_array = np.array(all_rows, dtype=float)

        # Reduce only over columns that have at least one real value:
        # np.nanmax / np.nanmean emit a RuntimeWarning on all-NaN columns.
        has_data = ~np.isnan(all_array).all(axis=0)
        best_values = np.full(len(latex_order), np.nan)
        avg_values = np.full(len(latex_order), np.nan)
        if has_data.any():
            best_values[has_data] = np.nanmax(all_array[:, has_data], axis=0)
            avg_values[has_data] = np.nanmean(all_array[:, has_data], axis=0)

        # 2) print per-client rows, bolding only the column-wise maxima
        for i, row_vals in enumerate(all_rows):
            cells = []
            for val, best in zip(row_vals, best_values):
                # np.isclose(nan, nan) is False, so NaN cells are never bolded.
                if np.isclose(val, best):
                    cells.append(f"\\textbf{{{val:.2f}}}")
                else:
                    cells.append(f"{val:.2f}")
            print(f"& {i} & " + " & ".join(cells) + " \\\\")

        # 3) average row
        print('\\cline{2-10}')
        avg_cells = [f"{v:.2f}" for v in avg_values]
        print("& none (avg) & " + " & ".join(avg_cells) + " \\\\")

        # 4) best row (printed plain; the bold markers are in the client rows)
        best_cells = [f"{v:.2f}" for v in best_values]
        print("& none (best) & " + " & ".join(best_cells) + " \\\\")

61.23 & 35.00 & 60.00 & 57.50 & 73.50 & 52.11 & 82.29 & 50.50 & 59.02 \\
