In [2]:
import json
import numpy as np
import os

def print_tex_result(results_file, primary_metrics, percent_metrics):
    with open(results_file, 'r') as f:
        results = json.load(f)

    task_metrics = {}
    for task, metric_name in primary_metrics.items():
        if task in results and metric_name in results[task]:
            value = results[task][metric_name]
            if metric_name in percent_metrics:
                task_metrics[task] = value * 100
            else:
                task_metrics[task] = value
        else:
            task_metrics[task] = np.nan
    return task_metrics

In [2]:
results_dir = "evaluations_final/hetero-3B-optimal-426/20/"
mode = "hetero-p"  # Change this to "none", "homo", "hetero-g", "hetero-d" or "hetero-p"

In [5]:
num_clients = 8 # Define the number of clients

primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "accuracy",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "accuracy"
}
percent_metrics = {"accuracy", "f1_score", "rougeL"}
latex_order = list(primary_metrics.keys())

if mode == "homo":
    print(f"Mode: {mode}")
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    try:
        metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        latex_row = " & ".join([f"{metrics.get(task, np.nan):.2f}" for task in latex_order]) + " \\\\"
        print(latex_row)
    except FileNotFoundError:
        print(f"Error: Results file not found at {results_path}")

elif mode == "none":
    print(f"Mode: {mode}")
    all_rows = []
    # 1) load everything
    for i in range(num_clients):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        try:
            task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
            all_rows.append([task_metrics.get(task, np.nan) for task in latex_order])
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Skipping client.")
            all_rows.append([np.nan] * len(latex_order)) # Add row of NaNs

    if not all_rows:
        print("Error: No client data loaded.")
    else:
        all_array   = np.array(all_rows)
        best_values = np.nanmax(all_array, axis=0)
        avg_values  = np.nanmean(all_array, axis=0)

        # 2) print per‐client rows, bolding only the column‐wise maxima
        for i, row_vals in enumerate(all_rows):
            cells = []
            for idx, val in enumerate(row_vals):
                # Check if val is NaN before comparison
                if not np.isnan(val) and np.isclose(val, best_values[idx]):
                    cells.append(f"\\textbf{{{val:.2f}}}")
                elif np.isnan(val):
                     cells.append(f"NaN") # Or some other placeholder
                else:
                    cells.append(f"{val:.2f}")
            print(f"{i} & " + " & ".join(cells) + " \\\\")

        # 3) average row
        print('\\cline{2-10}') # Adjust column range if needed
        avg_cells = [f"{v:.2f}" if not np.isnan(v) else "NaN" for v in avg_values]
        print(f"none (avg) & " + " & ".join(avg_cells) + " \\\\")

        # 4) best row (still bold)
        best_cells = [f"{v:.2f}" if not np.isnan(v) else "NaN" for v in best_values]
        print(f"none (best) & " + " & ".join(best_cells) + " \\\\")

elif mode == "hetero-g":
    print(f"Mode: {mode}")
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    try:
        metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        latex_row = " & ".join([f"{metrics.get(task, np.nan):.2f}" for task in latex_order]) + " \\\\"
        print(latex_row)
    except FileNotFoundError:
        print(f"Error: Results file not found at {results_path}")

elif mode == "hetero-d":
    print(f"Mode: {mode}")
    hetero_metrics = {}
    valid_clients_count = 0
    # Load metrics for each client, assuming client i corresponds to task i in latex_order
    # This interpretation might need adjustment based on your exact setup.
    if len(latex_order) != num_clients:
         print(f"Warning: Number of tasks ({len(latex_order)}) does not match number of clients ({num_clients}). Diagonal logic might be incorrect.")

    for i in range(num_clients):
        task = latex_order[i] # Task corresponding to this client index
        results_path = os.path.join(results_dir, f"client_{i}_hetlora_output_metrics.json")
        try:
            client_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
            # Get the primary metric for the specific task 'task' from this client's results
            hetero_metrics[task] = client_metrics.get(task, np.nan)
            if not np.isnan(hetero_metrics[task]):
                 valid_clients_count += 1
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Setting metric for task '{task}' to NaN.")
            hetero_metrics[task] = np.nan

    if valid_clients_count == 0:
        print("Error: No HetLoRA client data loaded.")
    else:
        # Format the 'diagonal' metrics into a single row, similar to 'homo' mode
        latex_row = " & ".join([f"{hetero_metrics.get(t, np.nan):.2f}" for t in latex_order]) + " \\\\"
        print(latex_row)

elif mode == "hetero-p":
    print(f"Mode: {mode}")
    hetero_metrics = {}
    valid_clients_count = 0
    # Load metrics for each client, assuming client i corresponds to task i in latex_order
    # This interpretation might need adjustment based on your exact setup.
    if len(latex_order) != num_clients:
         print(f"Warning: Number of tasks ({len(latex_order)}) does not match number of clients ({num_clients}). Diagonal logic might be incorrect.")

    for i in range(num_clients):
        task = latex_order[i] # Task corresponding to this client index
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        try:
            client_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
            # Get the primary metric for the specific task 'task' from this client's results
            hetero_metrics[task] = client_metrics.get(task, np.nan)
            if not np.isnan(hetero_metrics[task]):
                 valid_clients_count += 1
        except FileNotFoundError:
            print(f"Warning: Results file not found for client {i} at {results_path}. Setting metric for task '{task}' to NaN.")
            hetero_metrics[task] = np.nan

    if valid_clients_count == 0:
        print("Error: No HetLoRA client data loaded.")
    else:
        # Format the 'diagonal' metrics into a single row, similar to 'homo' mode
        latex_row = " & ".join([f"{hetero_metrics.get(t, np.nan):.2f}" for t in latex_order]) + " \\\\"
        print(latex_row)

else:
    print(f"Error: Invalid mode '{mode}'. Choose 'none', 'homo', or 'hetero'.")

Mode: hetero-p
57.00 & 76.50 & 63.50 & 57.50 & 85.50 & 51.98 & 94.65 & 56.50 \\


In [3]:
import os
import numpy as np

# Two result directories to compare
results_dir1 = "evaluations_final/none-homo-1B-426/20"
results_dir2 = "evaluations_final/none-homo-3B-426/20"

primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "accuracy",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "accuracy"
}
percent_metrics = {"accuracy", "f1_score", "rougeL"}
latex_order = list(primary_metrics.keys())

# 1) Gather all client scores for each directory
all_rows1 = []
all_rows2 = []
for i in range(8):
    path1 = os.path.join(results_dir1, f"client_{i}_output_metrics.json")
    path2 = os.path.join(results_dir2, f"client_{i}_output_metrics.json")

    m1 = print_tex_result(path1, primary_metrics, percent_metrics)
    m2 = print_tex_result(path2, primary_metrics, percent_metrics)

    all_rows1.append([m1[task] for task in latex_order])
    all_rows2.append([m2[task] for task in latex_order])

all_array1 = np.array(all_rows1)
all_array2 = np.array(all_rows2)

# 2) Compute per-column bests for each directory separately
best1 = np.nanmax(all_array1, axis=0)
best2 = np.nanmax(all_array2, axis=0)

# 3) Print each client row, bolding entries that match the best within their own dir
for i in range(8):
    cells = []
    for idx, task in enumerate(latex_order):
        v1 = all_rows1[i][idx]
        v2 = all_rows2[i][idx]

        # bold if this client has the max for that column in its directory
        if np.isclose(v1, best1[idx]):
            s1 = f"\\textbf{{{v1:.2f}}}"
        else:
            s1 = f"{v1:.2f}"

        if np.isclose(v2, best2[idx]):
            s2 = f"\\textbf{{{v2:.2f}}}"
        else:
            s2 = f"{v2:.2f}"

        cells.append(f"{s1}/{s2}")

    row_str = " & ".join(cells)
    print(f"{i} & {row_str} \\\\")

# Add a horizontal line
print("\\hline")

# Calculate and add the average row
avg1 = np.nanmean(all_array1, axis=0)
avg2 = np.nanmean(all_array2, axis=0)

avg_cells = []
for idx, task in enumerate(latex_order):
    avg_v1 = avg1[idx]
    avg_v2 = avg2[idx]
    avg_cells.append(f"{avg_v1:.2f}/{avg_v2:.2f}")

avg_row_str = " & ".join(avg_cells)
print(f"Avg & {avg_row_str} \\\\")

0 & \textbf{53.00}/\textbf{73.00} & 37.00/32.50 & 62.00/65.00 & 48.00/41.00 & 17.50/57.00 & 28.33/37.47 & 31.36/75.88 & 48.00/53.00 \\
1 & 52.00/55.50 & \textbf{76.00}/\textbf{86.50} & 61.50/64.50 & 35.50/38.50 & 6.50/53.00 & 29.81/32.90 & 44.38/71.46 & 42.00/48.50 \\
2 & \textbf{53.00}/62.00 & 38.50/52.50 & 51.50/\textbf{78.50} & 53.00/44.00 & 30.00/43.50 & 26.02/34.22 & 42.14/80.94 & 33.00/38.00 \\
3 & 45.00/55.00 & 34.50/48.50 & 57.00/65.50 & \textbf{69.50}/\textbf{70.00} & 21.50/44.50 & 25.68/36.78 & 40.09/77.83 & 28.50/55.00 \\
4 & \textbf{53.00}/56.00 & 36.00/40.50 & \textbf{64.50}/65.00 & 38.00/37.00 & \textbf{86.00}/\textbf{91.50} & 21.70/34.51 & 43.09/77.93 & 38.00/48.50 \\
5 & 3.50/23.00 & 0.50/23.00 & 0.00/24.00 & 0.50/7.00 & 0.00/16.00 & \textbf{51.08}/\textbf{53.84} & 61.16/75.13 & 0.50/21.00 \\
6 & 0.00/4.50 & 0.00/13.00 & 0.00/14.50 & 0.00/12.00 & 0.00/22.00 & 40.76/41.18 & \textbf{93.70}/\textbf{96.31} & 0.00/35.00 \\
7 & 50.00/53.00 & 37.00/36.00 & 60.50/61.00 & 40.00/

In [6]:
results_dir = "evaluations_final/none-hetero-3B-426/20"
is_global_model = False

primary_metrics = {
    "coreference": "accuracy",
    "entailment": "accuracy",
    "linguistic_acceptability": "accuracy",
    "paraphrase": "accuracy",
    "question_classification": "accuracy",
    "structure_to_text": "rougeL",
    "text_formatting": "rougeL",
    "word_disambiguation": "accuracy"
}
percent_metrics = {"accuracy", "f1_score", "rougeL", "rouge1"}
latex_order = list(primary_metrics.keys())

if is_global_model:
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
    latex_row = " & ".join([f"{metrics[task]:.2f}" for task in latex_order]) + " & " + f"{sum(metrics.values()) / len(metrics):.2f}" + " \\\\"
    print(latex_row)

else:
    all_rows = []
    # 1) load everything
    for i in range(8):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        all_rows.append([task_metrics[task] for task in latex_order])

    all_array   = np.array(all_rows)
    best_values = np.nanmax(all_array, axis=0)
    avg_values  = np.nanmean(all_array, axis=0)

    # 2) print per‐client rows, bolding only the column‐wise maxima
    for i, row_vals in enumerate(all_rows):
        cells = []
        for val, best in zip(row_vals, best_values):
            if np.isclose(val, best):
                cells.append(f"\\textbf{{{val:.2f}}}")
            else:
                cells.append(f"{val:.2f}")
        print(f"& {i} & " + " & ".join(cells) + " \\\\")

    # 3) average row
    print('\\cline{2-10}')
    avg_cells = [f"{v:.2f}" for v in avg_values]
    print(f"& none (avg) & " + " & ".join(avg_cells) + " \\\\")

    # 4) best row (still bold)
    best_cells = [f"{v:.2f}" for v in best_values]
    print(f"& none (best) & " + " & ".join(best_cells) + " \\\\")

FileNotFoundError: [Errno 2] No such file or directory: 'evaluations_final/none-hetero-3B-426/20/client_0_output_metrics.json'

In [15]:
# for appendix A in domain

results_dir = "evaluations_final/none-homo-3B-426/20/"
mode = "hetero-p"  # Change this to "none", "homo", "hetero-g", "hetero-d" or "hetero-p"

num_clients = 8 # Define the number of clients

primary_metrics = {
    "coreference": "rouge1",
    "entailment": "rouge1",
    "linguistic_acceptability": "rouge1",
    "paraphrase": "rouge1",
    "question_classification": "rouge1",
    "structure_to_text": "rouge1",
    "text_formatting": "rouge1",
    "word_disambiguation": "rouge1"
}
percent_metrics = {"accuracy", "f1_score", "rouge1"}
latex_order = list(primary_metrics.keys())

hetero_metrics = {}
valid_clients_count = 0
# Load metrics for each client, assuming client i corresponds to task i in latex_order
# This interpretation might need adjustment based on your exact setup.
if len(latex_order) != num_clients:
        print(f"Warning: Number of tasks ({len(latex_order)}) does not match number of clients ({num_clients}). Diagonal logic might be incorrect.")

for i in range(num_clients):
    task = latex_order[i] # Task corresponding to this client index
    results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
    try:
        client_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        # Get the primary metric for the specific task 'task' from this client's results
        hetero_metrics[task] = client_metrics.get(task, np.nan)
        if not np.isnan(hetero_metrics[task]):
                valid_clients_count += 1
    except FileNotFoundError:
        print(f"Warning: Results file not found for client {i} at {results_path}. Setting metric for task '{task}' to NaN.")
        hetero_metrics[task] = np.nan

if valid_clients_count == 0:
    print("Error: No HetLoRA client data loaded.")
else:
    # Format the 'diagonal' metrics into a single row, similar to 'homo' mode
    row_values = [hetero_metrics.get(t, np.nan) for t in latex_order]
    latex_row = " & ".join([f"{v:.2f}" for v in row_values])

    # Compute average (ignoring NaN)
    avg = np.nanmean(row_values)

    # Append average
    latex_row += f" & {avg:.2f} \\\\"
    print(latex_row)

75.50 & 86.50 & 78.50 & 70.00 & 91.50 & 68.32 & 96.29 & 55.75 & 77.79 \\


In [16]:
# for appendix A

results_dir = "evaluations_final/homo-3B-r4/20"
is_global_model = True

primary_metrics = {
    "coreference": "rouge1",
    "entailment": "rouge1",
    "linguistic_acceptability": "rouge1",
    "paraphrase": "rouge1",
    "question_classification": "rouge1",
    "structure_to_text": "rouge1",
    "text_formatting": "rouge1",
    "word_disambiguation": "rouge1"
}
percent_metrics = {"accuracy", "f1_score", "rougeL", "rouge1"}
latex_order = list(primary_metrics.keys())

if is_global_model:
    results_path = os.path.join(results_dir, "global_output_metrics.json")
    metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
    latex_row = " & ".join([f"{metrics[task]:.2f}" for task in latex_order]) + " & " + f"{sum(metrics.values()) / len(metrics):.2f}" + " \\\\"
    print(latex_row)

else:
    all_rows = []
    # 1) load everything
    for i in range(8):
        results_path = os.path.join(results_dir, f"client_{i}_output_metrics.json")
        task_metrics = print_tex_result(results_path, primary_metrics, percent_metrics)
        all_rows.append([task_metrics[task] for task in latex_order])

    all_array   = np.array(all_rows)
    best_values = np.nanmax(all_array, axis=0)
    avg_values  = np.nanmean(all_array, axis=0)

    # 2) print per‐client rows, bolding only the column‐wise maxima
    for i, row_vals in enumerate(all_rows):
        cells = []
        for val, best in zip(row_vals, best_values):
            if np.isclose(val, best):
                cells.append(f"\\textbf{{{val:.2f}}}")
            else:
                cells.append(f"{val:.2f}")
        print(f"& {i} & " + " & ".join(cells) + " \\\\")

    # 3) average row
    print('\\cline{2-10}')
    avg_cells = [f"{v:.2f}" for v in avg_values]
    avg_cells.append(f"{sum(avg_values) / len(avg_values):.2f}")
    print(f"& none (avg) & " + " & ".join(avg_cells) + " \\\\")

    # 4) best row (still bold)
    best_cells = [f"{v:.2f}" for v in best_values]
    best_cells.append(f"{sum(best_values) / len(best_values):.2f}")
    print(f"& none (best) & " + " & ".join(best_cells) + " \\\\")

67.20 & 70.00 & 77.00 & 68.50 & 85.50 & 65.16 & 93.38 & 61.50 & 73.53 \\
