In [None]:
from paretoGraphTeams import *

import shutil
import matplotlib as mpl

# Use LaTeX text rendering only when a `latex` executable is on PATH;
# otherwise explicitly switch usetex off so Matplotlib's own text engine is used.
mpl.rcParams.update(
    {
        "text.usetex": True,
        "font.family": "serif",
        "text.latex.preamble": r"\usepackage{amsmath}\usepackage{amssymb}",
    }
    if shutil.which("latex")
    else {"text.usetex": False}
)

# Load the pickled datasets. Each call is unpacked into four variables
# (experts, tasks, costs, graphmat); variants not used further down stay disabled.

# IMDB
(imdb_experts_1, imdb_tasks_1,
 imdb_costs_1, imdb_graphmat_1) = import_pickled_datasets('imdb', 1)
(imdb_experts_2, imdb_tasks_2,
 imdb_costs_2, imdb_graphmat_2) = import_pickled_datasets('imdb', 2)
# imdb_experts_3, imdb_tasks_3, imdb_costs_3, imdb_graphmat_3 = import_pickled_datasets('imdb', 3)

# Bibsonomy
(bbsm_experts_1, bbsm_tasks_1,
 bbsm_costs_1, bbsm_graphmat_1) = import_pickled_datasets('bbsm', 1)
# bbsm_experts_2, bbsm_tasks_2, bbsm_costs_2, bbsm_graphmat_2 = import_pickled_datasets('bbsm', 2)

# Freelancer
(fl_experts_1, fl_tasks_1,
 fl_costs_1, fl_graphmat_1) = import_pickled_datasets('freelancer', 1)
# fl_experts_2, fl_tasks_2, fl_costs_2, fl_graphmat_2 = import_pickled_datasets('freelancer', 2)

In [None]:
# Randomize distance matrices (symmetric, zero diagonal)
rng = np.random.default_rng(42)

def _randomize_graphmat(mat):
    n = mat.shape[0]
    rand = rng.uniform(0.1, 1.0, size=(n, n))
    rand = (rand + rand.T) / 2.0
    np.fill_diagonal(rand, 0.0)
    return rand

# Swap every loaded matrix for its randomized counterpart. The list is built
# left to right, so the generator is consumed in the order imdb-1, imdb-2, bbsm-1, fl-1.
imdb_graphmat_1, imdb_graphmat_2, bbsm_graphmat_1, fl_graphmat_1 = [
    _randomize_graphmat(loaded)
    for loaded in (imdb_graphmat_1, imdb_graphmat_2, bbsm_graphmat_1, fl_graphmat_1)
]

In [None]:
# Sanity check: show the first row of the randomized IMDB-1 matrix
# (its first entry is the zeroed diagonal element).
imdb_graphmat_1[0]

### Average coverage plots across tasks

In [None]:
def findApproximateParetoSolutions(tasks_list, experts_list, graphmat,
                                   sizeUniverse, numExperts, numTasks, maxDiameter,
                                   dataset_name=None, start_index=0):
    '''
    Run the graph-diameter algorithms over several tasks, average their coverage
    curves on a shared diameter grid, plot the result and save it as PDF.

    For every task index in ``[start_index, start_index + numTasks)`` a
    ``paretoGraph`` object is built and four solvers are run in this order:
    ``ParetoGreedyDiameter``, ``plainGreedyDistanceScaled``,
    ``topKDistanceScaled`` and ``graphPruning``. Each returned
    (diameters, coverages) curve is interpolated onto 15 evenly spaced
    diameters in ``[0, maxDiameter]`` and the curves are averaged across tasks.

    Args:
        tasks_list: Indexable collection of tasks; one entry is passed to
            ``paretoGraph`` per iteration.
        experts_list: Sequence of experts; only the first ``numExperts`` are used.
        graphmat: 2-D array of pairwise costs; the top-left
            ``numExperts x numExperts`` block is used.
        sizeUniverse: Forwarded to ``paretoGraph`` as ``size_univ``.
        numExperts: Number of experts (and matrix rows/columns) to keep.
        numTasks: Number of consecutive tasks to evaluate; must be >= 1.
        maxDiameter: Upper end of the shared diameter grid.
        dataset_name: Used (spaces replaced by underscores) in the output file
            name; defaults to "dataset".
        start_index: Index of the first task to evaluate.

    Returns:
        None. Side effects: writes ``<name>_graph.pdf`` (and, if it does not
        exist yet, ``graph_legend.pdf``) under ``plots/graph`` two directory
        levels above the current working directory, shows the figure, and logs
        a per-algorithm runtime summary via ``logging.info``.

    Raises:
        ValueError: If ``numTasks`` is smaller than 1.

    Note:
        The shaded band is mean +/- 0.25 * std (population std across tasks),
        clipped to [0, 1] -- not a full standard deviation.
    '''
    # Function-scope imports keep this cell runnable on its own.
    from pathlib import Path
    from matplotlib.lines import Line2D

    # Fail early with a clear message; otherwise np.vstack below raises an
    # opaque ValueError on an empty list.
    if numTasks < 1:
        raise ValueError(f"numTasks must be at least 1, got {numTasks}")

    pareto_name = "ParetoGreedy-Diameter"
    algo_names = [pareto_name, "PlainGreedy-Scaled", "TopK-Scaled", "GraphPruning"]

    # Diameter grid (same for all tasks)
    num_steps, min_diameter = 15, 0.0
    diameter_arr = np.linspace(min_diameter, maxDiameter, num_steps)

    # Width of the shaded band in units of std. The original code halved the
    # std twice (0.5 * 0.5); the same value is kept here in one visible place.
    # NOTE(review): confirm a quarter-std band is what the figures should show.
    band_scale = 0.25

    # \texttt only renders when LaTeX text rendering is active; with the
    # non-LaTeX fallback the raw markup would be printed in the legend.
    use_tex = bool(plt.rcParams["text.usetex"])

    def legend_label(alg, ours):
        name = rf"\texttt{{{alg}}}" if use_tex else alg
        return f"{name} (ours)" if ours else name

    def align_to_diameter_arr(diameters, covs):
        """Interpolate one (diameter, coverage) curve onto ``diameter_arr``."""
        if len(diameters) == 0 or len(covs) == 0:
            return np.zeros_like(diameter_arr, dtype=float)
        diameters = np.array(diameters, dtype=float)
        covs = np.array(covs, dtype=float)

        # Ensure matching lengths
        min_len = min(len(diameters), len(covs))
        diameters = diameters[:min_len]
        covs = covs[:min_len]

        # Aggregate duplicate diameters by taking max coverage
        agg = {}
        for d, c in zip(diameters, covs):
            agg[d] = max(agg[d], c) if d in agg else c
        if len(agg) == 0:
            return np.zeros_like(diameter_arr, dtype=float)
        diameters_sorted = np.array(sorted(agg.keys()), dtype=float)
        covs_sorted = np.array([agg[d] for d in diameters_sorted], dtype=float)

        # Outside the observed range the curve is held flat at its end values.
        return np.interp(diameter_arr, diameters_sorted, covs_sorted,
                         left=covs_sorted[0], right=covs_sorted[-1])

    # containers across tasks
    all_coverages = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}
    pareto_diams_all = []
    pareto_points_counts = []

    # iterate tasks
    for task_index in range(start_index, start_index + numTasks):
        # Initialize Pareto teams object
        pareto_diam = paretoGraph(task=tasks_list[task_index],
                                  n_experts=experts_list[:numExperts],
                                  pairwise_costs=graphmat[:numExperts, :numExperts],
                                  size_univ=sizeUniverse,
                                  budget=1)

        # Each solver returns (diameters, coverages, _, _, runtime). The dict
        # literal is evaluated top to bottom, which fixes the solver call order.
        solver_outputs = {
            "ParetoGreedy-Diameter": pareto_diam.ParetoGreedyDiameter(),
            "PlainGreedy-Scaled": pareto_diam.plainGreedyDistanceScaled(),
            "TopK-Scaled": pareto_diam.topKDistanceScaled(),
            "GraphPruning": pareto_diam.graphPruning(),
        }

        for alg in algo_names:
            diams, covs, _, _, run_time = solver_outputs[alg]
            diams = np.array(diams, dtype=float)

            # Interpolate to shared diameter grid
            all_coverages[alg].append(align_to_diameter_arr(diams, covs))
            # nansum maps a missing/NaN runtime to 0.0 instead of poisoning the mean.
            all_runtimes[alg].append(float(np.nansum(np.array([run_time], dtype=float))))

            if alg == pareto_name:
                # Remember where the Pareto points fall so markers can be drawn there.
                pareto_diams_all.extend(diams)
                pareto_points_counts.append(len(diams))

    # compute mean and std across tasks for each algorithm
    mean_coverages = {}
    std_coverages = {}
    for alg in algo_names:
        stacked = np.vstack(all_coverages[alg])  # shape (numTasks, len(diameter_arr))
        mean_coverages[alg] = np.mean(stacked, axis=0)
        std_coverages[alg] = np.std(stacked, axis=0)

    # Average number of Pareto points per task (at least one marker is drawn).
    mean_pareto_points = max(1, int(np.round(np.mean(pareto_points_counts))))

    # Plot mean coverage with shaded std band
    colors = plt.get_cmap("tab10").colors
    markers = ['o', 's', '^', 'v', 'D']

    fig, ax = plt.subplots(figsize=(9, 5.5))
    label_map = {}
    pareto_legend_handle = None
    for i, alg in enumerate(algo_names):
        mean = mean_coverages[alg]
        std = band_scale * std_coverages[alg]
        is_pareto = alg == pareto_name
        marker_size = 7 if is_pareto else 6
        line_style = '-' if is_pareto else ':'
        color = colors[3] if is_pareto else colors[(i + 1) % len(colors)]
        marker = "D" if is_pareto else markers[i % len(markers)]
        zorder = 4 if is_pareto else 3

        label = legend_label(alg, ours=is_pareto)
        label_map[alg] = label
        if is_pareto:
            # Line and markers are drawn separately: the markers sit at observed
            # Pareto diameters rather than on the regular grid.
            ax.plot(diameter_arr, mean,
                    label="_nolegend_",
                    color=color,
                    linestyle=line_style,
                    linewidth=1.8,
                    zorder=zorder)
            pareto_diams_unique = np.unique(np.array(pareto_diams_all, dtype=float))
            in_range = ((pareto_diams_unique >= diameter_arr.min())
                        & (pareto_diams_unique <= diameter_arr.max()))
            pareto_diams_unique = pareto_diams_unique[in_range]
            if pareto_diams_unique.size > 0:
                # Thin the markers down to the average per-task Pareto point count.
                keep_count = max(1, min(mean_pareto_points, pareto_diams_unique.size))
                keep_idx = np.linspace(0, pareto_diams_unique.size - 1, keep_count).astype(int)
                pareto_diams_unique = pareto_diams_unique[keep_idx]
            pareto_mean_vals = np.interp(pareto_diams_unique, diameter_arr, mean)
            ax.scatter(pareto_diams_unique, pareto_mean_vals,
                       label="_nolegend_",
                       color=color,
                       marker=marker,
                       s=marker_size**2,
                       edgecolor='k',
                       linewidths=0.6,
                       zorder=zorder + 1)
            # Proxy artist combining line and marker for the legend entry.
            pareto_legend_handle = Line2D([0], [0],
                                          color=color,
                                          linestyle=line_style,
                                          marker=marker,
                                          markersize=marker_size,
                                          markeredgewidth=1.1,
                                          markeredgecolor='k',
                                          linewidth=1.8)
        else:
            ax.plot(diameter_arr, mean,
                    label=label,
                    color=color,
                    linestyle=line_style,
                    marker=marker,
                    markersize=marker_size,
                    markeredgewidth=1.1,
                    markeredgecolor='k',
                    linewidth=1.8,
                    zorder=zorder)
        ax.fill_between(diameter_arr,
                        np.clip(mean - std, 0, 1),
                        np.clip(mean + std, 0, 1),
                        color=color,
                        alpha=0.18,
                        zorder=2)

    ax.set_xlabel(r'Team diameter ($d$)', fontsize=28)
    ax.set_ylabel(r'Mean task coverage ($\bar{f}$)', fontsize=28)
    ax.set_title("")
    ax.grid(alpha=0.3)
    ax.tick_params(axis='both', labelsize=24)

    # Legend handles (saved separately); Pareto entry first, baselines after.
    handles, labels = ax.get_legend_handles_labels()
    ordered_handles, ordered_labels = [], []
    if handles:
        handle_map = dict(zip(labels, handles))
        if pareto_legend_handle is not None:
            handle_map[label_map[pareto_name]] = pareto_legend_handle
        legend_order = [pareto_name] + [alg for alg in algo_names if alg != pareto_name]
        # Filter handles and labels together so the two lists always line up.
        for alg in legend_order:
            entry_label = label_map[alg]
            if entry_label in handle_map:
                ordered_handles.append(handle_map[entry_label])
                ordered_labels.append(entry_label)

    # Save figure (output directory is two levels above the working directory)
    base_dir = Path.cwd().resolve().parents[1]
    plots_dir = base_dir / "plots" / "graph"
    plots_dir.mkdir(parents=True, exist_ok=True)
    safe_name = (dataset_name or "dataset").replace(" ", "_")
    out_path = plots_dir / f"{safe_name}_graph.pdf"
    fig.savefig(out_path, bbox_inches="tight")

    # Save legend as separate PDF (shared by all datasets, so written only once)
    if handles:
        legend_out_path = plots_dir / "graph_legend.pdf"
        if not legend_out_path.exists():
            legend_fig = plt.figure(figsize=(8, 2))
            legend_fig.legend(ordered_handles, ordered_labels, loc='center', ncol=2, fontsize=22, frameon=True)
            legend_fig.savefig(legend_out_path, bbox_inches="tight")
            plt.close(legend_fig)

    plt.show()

    # Runtime summary (mean ± std)
    runtime_lines = ["Runtime summary (mean ± std, seconds):"]
    for alg in algo_names:
        mean_rt = float(np.mean(all_runtimes[alg]))
        std_rt = float(np.std(all_runtimes[alg]))
        runtime_lines.append(f"  - {alg}: {mean_rt:.3f} ± {std_rt:.3f}")
    logging.info("\n".join(runtime_lines))

## Freelancer-1

In [None]:
# Freelancer: first 50 experts, 10 tasks; the diameter grid tops out at twice
# the largest pairwise cost among those experts.
findApproximateParetoSolutions(
    tasks_list=fl_tasks_1,
    experts_list=fl_experts_1,
    graphmat=fl_graphmat_1,
    sizeUniverse=50,
    numExperts=50,
    numTasks=10,
    maxDiameter=2 * fl_graphmat_1[:50, :50].max(),
    dataset_name="Freelancer",
)

## IMDB

In [None]:
# IMDB-1: first 150 experts, 10 tasks; the diameter grid tops out at twice
# the largest pairwise cost among those experts.
findApproximateParetoSolutions(
    tasks_list=imdb_tasks_1,
    experts_list=imdb_experts_1,
    graphmat=imdb_graphmat_1,
    sizeUniverse=24,
    numExperts=150,
    numTasks=10,
    maxDiameter=2 * imdb_graphmat_1[:150, :150].max(),
    dataset_name="IMDB-1",
)

In [None]:
# IMDB-2: first 150 experts, 20 tasks; the diameter grid tops out at twice
# the largest pairwise cost among those experts.
findApproximateParetoSolutions(
    tasks_list=imdb_tasks_2,
    experts_list=imdb_experts_2,
    graphmat=imdb_graphmat_2,
    sizeUniverse=24,
    numExperts=150,
    numTasks=20,
    maxDiameter=2 * imdb_graphmat_2[:150, :150].max(),
    dataset_name="IMDB-2",
)

## Bibsonomy-1

In [None]:
# Bibsonomy-1: first 150 experts, 20 tasks; the diameter grid tops out at twice
# the largest pairwise cost among those experts.
findApproximateParetoSolutions(
    tasks_list=bbsm_tasks_1,
    experts_list=bbsm_experts_1,
    graphmat=bbsm_graphmat_1,
    sizeUniverse=75,
    numExperts=150,
    numTasks=20,
    maxDiameter=2 * bbsm_graphmat_1[:150, :150].max(),
    dataset_name="Bbsm",
)