In [None]:
from paretoCardinalityTeams import *

import shutil
import matplotlib as mpl

# Switch Matplotlib to LaTeX text rendering only when a `latex` executable is
# found on PATH; otherwise explicitly keep Matplotlib's built-in text renderer.
mpl.rcParams.update(
    {
        "text.usetex": True,
        "font.family": "serif",
        "text.latex.preamble": r"\usepackage{amsmath}\usepackage{amssymb}",
    }
    if shutil.which("latex")
    else {"text.usetex": False}
)

# Load the pickled datasets. Every call to import_pickled_datasets is unpacked
# into four values, bound here as <prefix>_experts_<n>, <prefix>_tasks_<n>,
# <prefix>_costs_<n> and <prefix>_graphmat_<n>.
# NOTE(review): the integer argument presumably selects a numbered variant of
# the dataset — confirm against import_pickled_datasets.
# IMDB (variants 1-3)
imdb_experts_1, imdb_tasks_1, imdb_costs_1, imdb_graphmat_1 = import_pickled_datasets('imdb', 1)
imdb_experts_2, imdb_tasks_2, imdb_costs_2, imdb_graphmat_2 = import_pickled_datasets('imdb', 2)
imdb_experts_3, imdb_tasks_3, imdb_costs_3, imdb_graphmat_3 = import_pickled_datasets('imdb', 3)

# Bibsonomy (variants 1-3), loaded under the 'bbsm' key
bbsm_experts_1, bbsm_tasks_1, bbsm_costs_1, bbsm_graphmat_1 = import_pickled_datasets('bbsm', 1)
bbsm_experts_2, bbsm_tasks_2, bbsm_costs_2, bbsm_graphmat_2 = import_pickled_datasets('bbsm', 2)
bbsm_experts_3, bbsm_tasks_3, bbsm_costs_3, bbsm_graphmat_3 = import_pickled_datasets('bbsm', 3)

# Freelancer (variants 1-2)
fl_experts_1, fl_tasks_1, fl_costs_1, fl_graphmat_1 = import_pickled_datasets('freelancer', 1)
fl_experts_2, fl_tasks_2, fl_costs_2, fl_graphmat_2 = import_pickled_datasets('freelancer', 2)

In [None]:
def findApproximateParetoSolutions(tasks_list, experts_list, size_univ, numTasks, k_max, numExperts, dataset_name=""):
    '''
    Run C-Greedy, TopK and Random on the first `numTasks` tasks, aggregate the
    coverage reached at every cardinality k = 1..k_max, plot the per-algorithm
    mean with a shaded band of +/- 0.5 standard deviations, save the figure
    (and, once, a standalone legend) as PDF, and log a runtime summary.

    Parameters:
    - tasks_list: List of tasks (each task is a list of required skills)
    - experts_list: List of experts (each expert is a list of skills)
    - size_univ: Size of the universe of skills
    - numTasks: Number of tasks to evaluate (tasks_list[0 .. numTasks-1])
    - k_max: Maximum team size (cardinality)
    - numExperts: Number of experts to consider (experts_list[:numExperts])
    - dataset_name: Name of the dataset; spaces are replaced by underscores
      and the result is used as the output file prefix ("dataset" if empty)

    Returns:
    - None. Side effects: writes
      <two levels above cwd>/plots/cardinality/<name>_cardinality.pdf, writes
      cardinality_legend.pdf in the same directory if it does not exist yet,
      shows the figure and emits the runtime summary via logging.info.

    Raises:
    - ValueError: if numTasks is not positive (there is nothing to aggregate).
    '''
    from pathlib import Path

    if numTasks <= 0:
        raise ValueError(f"numTasks must be a positive integer, got {numTasks}")

    # (display name, solver method to call, carry last coverage over missing k)
    algo_specs = [
        ("C-Greedy", "greedyCardinality", True),
        ("TopK", "top_k", False),
        ("Random", "random_selection", False),
    ]
    algo_names = [name for name, _, _ in algo_specs]

    # containers across tasks
    all_coverages = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}

    for task_index in range(numTasks):
        for alg, method_name, carry_forward in algo_specs:
            # Each algorithm runs on its own solver instance; its kSolDict is
            # read immediately after the run.
            solver = paretoCardinalityTeams(task=tasks_list[task_index],
                                            n_experts=experts_list[:numExperts],
                                            size_univ=size_univ,
                                            k_max=k_max)
            _, _, _, run_time = getattr(solver, method_name)()
            curve = _coverage_by_cardinality(solver.kSolDict, k_max, carry_forward)
            all_coverages[alg].append(np.array(curve, dtype=float))
            all_runtimes[alg].append(run_time)

    # Mean and (scaled) std across tasks for each algorithm.
    # The shaded band is deliberately narrower than one std: +/- band_scale * std.
    band_scale = 0.5
    mean_coverages = {}
    std_coverages = {}
    for alg in algo_names:
        stacked = np.vstack(all_coverages[alg])  # shape (numTasks, k_max)
        mean_coverages[alg] = np.mean(stacked, axis=0)
        std_coverages[alg] = np.std(stacked, axis=0) * band_scale

    # Plot styling (match knapsack teams formatting)
    tab10_colors = plt.get_cmap("tab10").colors
    color_map = {
        "TopK": tab10_colors[4],
        "C-Greedy": tab10_colors[1],
        "Random": tab10_colors[7],
    }
    marker_map = {
        "TopK": "o",
        "C-Greedy": "^",
        "Random": "s",
    }
    linestyle_map = {
        "TopK": (0, (1, 1)),
        "C-Greedy": (0, (3, 2)),
        "Random": (0, (2, 2)),
    }

    # \texttt is only understood when LaTeX rendering is active; with the
    # built-in renderer the command would be printed verbatim in the legend.
    use_tex = bool(plt.rcParams.get("text.usetex", False))
    cardinalities = range(1, k_max + 1)

    fig, ax = plt.subplots(figsize=(9, 5.5))
    for i, alg in enumerate(algo_names):
        mean = mean_coverages[alg]
        std = std_coverages[alg]
        color = color_map.get(alg, tab10_colors[i % len(tab10_colors)])
        ax.plot(cardinalities, mean,
                label=rf"\texttt{{{alg}}}" if use_tex else alg,
                color=color,
                linestyle=linestyle_map.get(alg, (0, (1, 1))),
                marker=marker_map.get(alg, 'o'),
                markersize=7,
                markeredgewidth=1.1,
                markeredgecolor='k',
                linewidth=1.8,
                zorder=3)
        # Band is clipped to [0, 1] before drawing.
        ax.fill_between(cardinalities,
                        np.clip(mean - std, 0, 1),
                        np.clip(mean + std, 0, 1),
                        color=color,
                        alpha=0.18,
                        zorder=2)

    ax.set_xlabel(r'Cardinality, $k$', fontsize=28)
    ax.set_ylabel(r'Task coverage, $f$', fontsize=28)
    ax.set_title("")
    ax.grid(alpha=0.3)
    ax.tick_params(axis='both', labelsize=24)

    # Legend handles (saved separately)
    handles, labels = ax.get_legend_handles_labels()

    # Save figure two directory levels above the current working directory
    base_dir = Path.cwd().resolve().parents[1]
    plots_dir = base_dir / "plots" / "cardinality"
    plots_dir.mkdir(parents=True, exist_ok=True)
    safe_name = (dataset_name or "dataset").replace(" ", "_")
    out_path = plots_dir / f"{safe_name}_cardinality.pdf"
    fig.savefig(out_path, bbox_inches="tight")

    # Save legend as separate PDF; an existing legend file is never overwritten
    if handles:
        legend_out_path = plots_dir / "cardinality_legend.pdf"
        if not legend_out_path.exists():
            legend_fig = plt.figure(figsize=(8, 2))
            legend_fig.legend(handles, labels, loc='center', ncol=3, fontsize=22, frameon=True)
            legend_fig.savefig(legend_out_path, bbox_inches="tight")
            plt.close(legend_fig)

    plt.show()

    # Runtime summary (mean ± std)
    runtime_lines = ["Runtime summary (mean ± std, seconds):"]
    for alg in algo_names:
        mean_rt = float(np.mean(all_runtimes[alg])) if len(all_runtimes[alg]) > 0 else 0.0
        std_rt = float(np.std(all_runtimes[alg])) if len(all_runtimes[alg]) > 0 else 0.0
        runtime_lines.append(f"  - {alg}: {mean_rt:.3f} ± {std_rt:.3f}")
    logging.info("\n".join(runtime_lines))


def _coverage_by_cardinality(k_sol_dict, k_max, carry_forward=False):
    '''
    Return the list of 'Coverage' values for k = 1..k_max read from k_sol_dict.

    For a k that is missing from k_sol_dict the entry is the most recent
    coverage seen at a smaller k when carry_forward is True (0 if none was
    seen yet), and 0 when carry_forward is False.
    '''
    curve = []
    last_seen = 0
    for k in range(1, k_max + 1):
        if k in k_sol_dict:
            last_seen = k_sol_dict[k]['Coverage']
            curve.append(last_seen)
        else:
            curve.append(last_seen if carry_forward else 0)
    return curve


### IMDB

In [None]:
# Experiment settings; numTasks and k_max are also read by the later dataset cells
numTasks = 50  # how many tasks are evaluated per dataset
k_max = 15  # largest team size (cardinality) considered

numExperts = 75
findApproximateParetoSolutions(
    tasks_list=imdb_tasks_1,
    experts_list=imdb_experts_1,
    size_univ=24,
    numTasks=numTasks,
    k_max=k_max,
    numExperts=numExperts,
    dataset_name="IMDB-1",
)

In [None]:
# IMDB variant 2, restricted to the first 100 experts
numExperts = 100
findApproximateParetoSolutions(
    tasks_list=imdb_tasks_2,
    experts_list=imdb_experts_2,
    size_univ=24,
    numTasks=numTasks,
    k_max=k_max,
    numExperts=numExperts,
    dataset_name="IMDB-2",
)

### Bibsonomy

In [None]:
# Bibsonomy variant 1, restricted to the first 150 experts
numExperts = 150
findApproximateParetoSolutions(
    tasks_list=bbsm_tasks_1,
    experts_list=bbsm_experts_1,
    size_univ=75,
    numTasks=numTasks,
    k_max=k_max,
    numExperts=numExperts,
    dataset_name="BBSM",
)

### Freelancer

In [None]:
# Freelancer variant 1, restricted to the first 100 experts
numExperts = 100
findApproximateParetoSolutions(
    tasks_list=fl_tasks_1,
    experts_list=fl_experts_1,
    size_univ=50,
    numTasks=numTasks,
    k_max=k_max,
    numExperts=numExperts,
    dataset_name="Freelancer",
)