### Pareto Teams - Knapsack Cost

In [None]:
from paretoKnapsackTeams import *
from utils import *
import matplotlib.cm as cm

# Load the pickled datasets used by the experiments below.
# Each import_pickled_datasets(name, k) call is unpacked into four objects,
# bound here as <prefix>_experts_k, <prefix>_tasks_k, <prefix>_costs_k and
# <prefix>_graphmat_k; k selects the numbered variant of the dataset.
# NOTE(review): graphmat is presumably a graph/adjacency matrix -- confirm
# against import_pickled_datasets; it is not used anywhere in this notebook view.

# IMDB (variants 1-3)
imdb_experts_1, imdb_tasks_1, imdb_costs_1, imdb_graphmat_1 = import_pickled_datasets('imdb', 1)
imdb_experts_2, imdb_tasks_2, imdb_costs_2, imdb_graphmat_2 = import_pickled_datasets('imdb', 2)
imdb_experts_3, imdb_tasks_3, imdb_costs_3, imdb_graphmat_3 = import_pickled_datasets('imdb', 3)

# Bibsonomy (variants 1-3)
bbsm_experts_1, bbsm_tasks_1, bbsm_costs_1, bbsm_graphmat_1 = import_pickled_datasets('bbsm', 1)
bbsm_experts_2, bbsm_tasks_2, bbsm_costs_2, bbsm_graphmat_2 = import_pickled_datasets('bbsm', 2)
bbsm_experts_3, bbsm_tasks_3, bbsm_costs_3, bbsm_graphmat_3 = import_pickled_datasets('bbsm', 3)

# Freelancer (only variants 1 and 2 are loaded)
fl_experts_1, fl_tasks_1, fl_costs_1, fl_graphmat_1 = import_pickled_datasets('freelancer', 1)
fl_experts_2, fl_tasks_2, fl_costs_2, fl_graphmat_2 = import_pickled_datasets('freelancer', 2)

### IMDB -- Original plotting per Task

In [None]:
def findApproximateParetoSolutions(tasks_list, experts_list, costs_list,
                                   sizeUniverse, numExperts, numTasks, maxBudget):
    '''
    Run every knapsack team-formation algorithm on the first `numTasks` tasks
    and show one figure per task (coverage vs. cost on top, total runtime per
    algorithm below).

    NOTE: the `costs_list` argument is not used; it is replaced by costs drawn
    at random from range(5, 250, 20), one per entry of `experts_list`.
    Only the first `numExperts` experts are handed to the solver.
    '''
    # (plot label, solver method) for algorithms that are re-run at each budget
    per_budget_algos = (
        ("PlainGreedy", "plainGreedy"),
        ("GreedyPlus", "greedyPlus"),
        ("TwoGuessPlainGreedy", "twoGuessPlainGreedy"),
        ("OneGuessGreedyPlus", "oneGuessGreedyPlus"),
    )
    # (plot label, solver method) for algorithms that are run once at maxBudget
    prefix_algos = (
        ("PrefixPareto-1Guess", "prefixParetoGreedy_1Guess"),
        ("PrefixPareto-2Guess", "prefixParetoGreedy_2Guess"),
    )
    all_labels = [label for label, _ in per_budget_algos + prefix_algos]

    # Randomly assign one of the candidate cost values to every expert
    candidate_costs = list(range(5, 250, 20))
    costs_list = [candidate_costs[np.random.randint(low=0, high=len(candidate_costs))]
                  for _ in range(len(experts_list))]

    for task_idx in range(numTasks):
        # Per-algorithm logs for this task
        coverage_log = {label: [] for label in all_labels}
        cost_log = {label: [] for label in all_labels}
        runtime_log = {label: [] for label in all_labels}

        # Budget grid: start at 5, grow by a factor (1 + step), end at maxBudget
        step, level = 0.1, 5
        budgets = [level]
        while True:
            candidate = level*(1 + step)
            if not (candidate < maxBudget):
                break
            level = round(candidate, 2)
            budgets.append(level)
        budgets.append(maxBudget)

        # Budget-dependent algorithms: one solver instance per budget value
        for budget in budgets:
            solver = paretoKnapsack(task=tasks_list[task_idx],
                                    n_experts=experts_list[:numExperts],
                                    costs=costs_list, size_univ=sizeUniverse,
                                    budget=budget)
            for label, method_name in per_budget_algos:
                # Each call yields five values; the last three are recorded
                _, _, cov, spent, elapsed = getattr(solver, method_name)()
                coverage_log[label].append(cov)
                cost_log[label].append(spent)
                runtime_log[label].append(elapsed)

        # Prefix Pareto variants: a single run at the maximum budget
        solver = paretoKnapsack(task=tasks_list[task_idx],
                                n_experts=experts_list[:numExperts],
                                costs=costs_list, size_univ=sizeUniverse,
                                budget=maxBudget)
        for label, method_name in prefix_algos:
            cost_log[label], coverage_log[label], _, elapsed = getattr(solver, method_name)()
            runtime_log[label].append(elapsed)

        # ---- Figure: coverage curves (top) and runtime bars (bottom) ----
        labels = list(coverage_log.keys())
        palette = cm.magma(np.linspace(0.01, 0.8, len(labels)))  # one colour per algorithm

        fig, axs = plt.subplots(2, 1, figsize=(8, 6),
                                gridspec_kw={'height_ratios': [2, 1], 'hspace': 0.35})
        coverage_ax, runtime_ax = axs[0], axs[1]

        for label, shade in zip(labels, palette):
            if label.startswith("PrefixPareto"):
                continue
            # Plot up to one point past the budget where coverage peaks
            cutoff = np.argmax(coverage_log[label]) + 2
            coverage_ax.plot(budgets[:cutoff], coverage_log[label][:cutoff],
                             '*--', alpha=0.5, label=label, color=shade)

        coverage_ax.plot(cost_log["PrefixPareto-1Guess"], coverage_log["PrefixPareto-1Guess"],
                         'o--', alpha=0.5, label="PrefixPareto-1Guess", color='red')
        coverage_ax.plot(cost_log["PrefixPareto-2Guess"], coverage_log["PrefixPareto-2Guess"],
                         '^--', alpha=0.6, label="PrefixPareto-2Guess", color='orange')

        coverage_ax.set_title('Coverage vs. Cost')
        coverage_ax.set_ylabel("Task Coverage")
        coverage_ax.set_xlabel("Cost Budget")
        coverage_ax.grid(alpha=0.3)

        # Total runtime per algorithm (NaNs ignored, empty log counts as 0)
        runtime_totals = []
        for label in labels:
            samples = runtime_log.get(label, [])
            if len(samples) > 0:
                series = np.array(samples, dtype=float)
            else:
                series = np.array([0.0])
            runtime_totals.append(float(np.nansum(series)))

        positions = np.arange(len(labels))
        bars = runtime_ax.bar(positions, runtime_totals, width=0.3, color=list(palette))
        runtime_ax.set_xticks(positions)
        runtime_ax.set_xticklabels(labels, ha='center', fontsize=7)
        runtime_ax.set_title('Total Runtime per Algorithm')
        runtime_ax.set_ylabel('Total Runtime (s)')
        runtime_ax.set_xlabel('Algorithm')
        runtime_ax.grid(axis='y', alpha=0.3)

        # Write each total above its bar
        for bar, total in zip(bars, runtime_totals):
            runtime_ax.text(bar.get_x() + bar.get_width() / 2, total, f"{total:.3f}",
                            ha='center', va='bottom', fontsize=9)

        # One shared legend, taken from the coverage subplot
        handles, legend_labels = coverage_ax.get_legend_handles_labels()
        if handles:
            fig.legend(handles, legend_labels, loc='upper center', ncol=3)

        plt.show()

### IMDB -- Average Plotting across Tasks

In [None]:
def findApproximateParetoSolutions(tasks_list, experts_list, costs_list,
                                   sizeUniverse, numExperts, numTasks, maxBudget,
                                   std_scale=0.5):
    '''
    Run the knapsack team-formation algorithms over several tasks, aggregate
    the results across tasks, and plot mean coverage (with a shaded std band)
    and mean total runtime (with std error bars).

    Parameters
    ----------
    tasks_list : indexable; tasks_list[i] is used for i in range(numTasks).
    experts_list : sequence of experts; only experts_list[:numExperts] is
        passed to the solver.
    costs_list : accepted for interface compatibility but NOT used -- for every
        task it is replaced by costs sampled at random from range(5, 250, 20),
        one per entry of experts_list.
    sizeUniverse : forwarded to paretoKnapsack as size_univ.
    numExperts : number of leading experts handed to the solver.
    numTasks : number of leading tasks to evaluate (must be >= 1).
    maxBudget : largest budget on the budget grid; also the budget used for
        the PrefixPareto runs.
    std_scale : factor applied to the across-task standard deviation before it
        is drawn as the shaded band (default 0.5, i.e. +/- half a std).

    Returns
    -------
    None. Two matplotlib figures are shown via plt.show().

    Raises
    ------
    ValueError : if numTasks < 1 (there is nothing to aggregate).
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    if numTasks < 1:
        raise ValueError("numTasks must be at least 1 to aggregate results")

    # Budget grid (same for all tasks): start at 5, grow by (1 + eps), end at maxBudget
    eps, min_cost = 0.1, 5
    cost_arr = [min_cost]
    while min_cost*(1+eps) < maxBudget:
        min_cost = round(min_cost*(1 + eps), 2)
        cost_arr.append(min_cost)
    cost_arr.append(maxBudget)
    cost_arr = np.array(cost_arr)

    # (label, solver method name) for the algorithms re-run at every budget
    budget_algos = [("PlainGreedy", "plainGreedy"),
                    ("GreedyPlus", "greedyPlus"),
                    ("TwoGuessPlainGreedy", "twoGuessPlainGreedy"),
                    ("OneGuessGreedyPlus", "oneGuessGreedyPlus")]
    algo_names = [name for name, _ in budget_algos] + ["PrefixPareto-1Guess", "PrefixPareto-2Guess"]

    def align_to_cost_arr(costs, covs):
        # Resample a (cost, coverage) curve onto cost_arr by linear interpolation
        # so that it can be averaged with the per-budget algorithms.
        costs = np.asarray(costs, dtype=float)
        covs = np.asarray(covs, dtype=float)
        if costs.size == 0:
            return np.zeros_like(cost_arr, dtype=float)
        # np.interp silently returns garbage unless the x-coordinates are
        # increasing, so order the points by cost first.
        order = np.argsort(costs, kind='stable')
        costs, covs = costs[order], covs[order]
        return np.interp(cost_arr, costs, covs, left=covs[0], right=covs[-1])

    # containers across tasks
    all_coverages = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}

    # Candidate cost values to assign to experts
    sampleCostVals = list(range(5, 250, 20))

    for task_index in range(numTasks):
        # per-task containers
        task_coverages = {alg: [] for alg in algo_names}
        task_runtimes = {alg: [] for alg in algo_names}

        # Fresh random expert costs for this task (the costs_list argument is ignored)
        costs_list = [sampleCostVals[np.random.randint(low=0, high=len(sampleCostVals))] for _ in range(len(experts_list))]

        for budgetVal in cost_arr:
            paretoTeams = paretoKnapsack(task=tasks_list[task_index],
                                         n_experts=experts_list[:numExperts],
                                         costs=costs_list, size_univ=sizeUniverse,
                                         budget=budgetVal)
            for alg, method_name in budget_algos:
                # Each call is unpacked as 5 values; the 3rd is recorded as
                # coverage and the 5th as runtime.
                _, _, cov, _, runtime = getattr(paretoTeams, method_name)()
                task_coverages[alg].append(cov)
                task_runtimes[alg].append(runtime)

        # Prefix Pareto (computed once per task at full budget)
        paretoTeams_full = paretoKnapsack(task=tasks_list[task_index],
                                          n_experts=experts_list[:numExperts],
                                          costs=costs_list, size_univ=sizeUniverse,
                                          budget=maxBudget)
        pp1_costs, pp1_coverages, _, pp1_time = paretoTeams_full.prefixParetoGreedy_1Guess()
        pp2_costs, pp2_coverages, _, pp2_time = paretoTeams_full.prefixParetoGreedy_2Guess()

        task_coverages['PrefixPareto-1Guess'] = list(align_to_cost_arr(pp1_costs, pp1_coverages))
        task_coverages['PrefixPareto-2Guess'] = list(align_to_cost_arr(pp2_costs, pp2_coverages))
        task_runtimes['PrefixPareto-1Guess'].append(pp1_time)
        task_runtimes['PrefixPareto-2Guess'].append(pp2_time)

        # Store this task's coverage curve and total runtime per algorithm
        for alg in algo_names:
            all_coverages[alg].append(np.array(task_coverages[alg], dtype=float))
            runtimes = task_runtimes[alg]
            # total runtime = sum over budgets (or the single value for prefix), NaNs ignored
            total_runtime = float(np.nansum(np.array(runtimes, dtype=float))) if runtimes else 0.0
            all_runtimes[alg].append(total_runtime)

    # mean and (scaled) std across tasks for each algorithm
    mean_coverages = {}
    std_coverages = {}
    for alg in algo_names:
        stacked = np.vstack(all_coverages[alg])  # shape (numTasks, len(cost_arr))
        mean_coverages[alg] = np.mean(stacked, axis=0)
        std_coverages[alg] = np.std(stacked, axis=0)*std_scale

    # Plot mean coverage with shaded band; distinct linestyles + markers
    colors = cm.magma(np.linspace(0.01, 0.8, len(algo_names)))
    linestyles = ['-', '--', '-.', ':', (0, (3,1,1,1)), (0, (1,1))]
    markers = ['o', 's', '^', 'v', 'D', 'X']

    # PrefixPareto lines are plotted last so they end up on top
    plot_order = [a for a in algo_names if not a.startswith('PrefixPareto')] + \
                [a for a in algo_names if a.startswith('PrefixPareto')]

    fig, ax = plt.subplots(figsize=(8, 5))
    for i, alg in enumerate(plot_order):
        mean = mean_coverages[alg]
        std = std_coverages[alg]
        line_color = colors[algo_names.index(alg)]

        ax.plot(cost_arr, mean,
                label=alg,
                color=line_color,
                linestyle=linestyles[i % len(linestyles)],
                marker=markers[i % len(markers)],
                markersize=5,
                markeredgewidth=0.8,
                markeredgecolor='k',
                linewidth=1.2,
                zorder=3)
        ax.fill_between(cost_arr,
                        np.clip(mean - std, 0, 1),
                        np.clip(mean + std, 0, 1),
                        color=line_color,
                        alpha=0.18,
                        zorder=2)

    ax.set_xlabel('Cost Budget')
    ax.set_ylabel('Mean Task Coverage')
    # Report the band width that is actually drawn
    ax.set_title(f'Mean Coverage across tasks (shaded = ±{std_scale:g} std)')
    ax.grid(alpha=0.3)
    ax.legend(fontsize=8, ncol=2)
    plt.show()

    # Plot total runtime per algorithm with mean +/- std bars
    means_rt = [np.mean(all_runtimes[alg]) for alg in algo_names]
    stds_rt = [np.std(all_runtimes[alg]) for alg in algo_names]
    x = np.arange(len(algo_names))
    fig2, ax2 = plt.subplots(figsize=(8, 3))
    ax2.bar(x, means_rt, yerr=stds_rt, capsize=5, color=list(colors))
    ax2.set_xticks(x)
    ax2.set_xticklabels(algo_names, rotation=30, ha='right', fontsize=8)
    ax2.set_ylabel('Total Runtime (s)')
    ax2.set_title('Mean Total Runtime per Algorithm (±std)')
    ax2.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

## IMDB

In [None]:
# IMDB dataset 1: first 10 tasks, first 75 experts, budgets up to 200.
findApproximateParetoSolutions(tasks_list=imdb_tasks_1, experts_list=imdb_experts_1, costs_list=imdb_costs_1, 
                               sizeUniverse=24, numExperts=75, numTasks=10, maxBudget=200)

In [None]:
# IMDB dataset 3: first 10 tasks, first 200 experts, budgets up to 200.
findApproximateParetoSolutions(tasks_list=imdb_tasks_3, experts_list=imdb_experts_3, costs_list=imdb_costs_3, 
                               sizeUniverse=24, numExperts=200, numTasks=10, maxBudget=200)

## Bbsm-1

In [None]:
# Bibsonomy dataset 2: first 10 tasks, first 100 experts, budgets up to 200.
# NOTE(review): the section heading says "Bbsm-1" but this cell uses the
# bbsm_*_2 dataset -- confirm which one is intended.
findApproximateParetoSolutions(tasks_list=bbsm_tasks_2, experts_list=bbsm_experts_2, costs_list=bbsm_costs_2, 
                               sizeUniverse=75, numExperts=100, numTasks=10, maxBudget=200)

## Freelancer-1

In [None]:
# Freelancer dataset 1: first 20 tasks, first 100 experts, budgets up to 400.
findApproximateParetoSolutions(tasks_list=fl_tasks_1, experts_list=fl_experts_1, costs_list=fl_costs_1, 
                               sizeUniverse=50, numExperts=100, numTasks=20, maxBudget=400)

In [None]:
#Summarize results in tables in overleaf - different datasets and sizes
#New datasets