### Pareto Teams - Knapsack Cost

In [None]:
from paretoKnapsack import *
from utils import *
import matplotlib.cm as cm

# Load every pickled dataset split up front. Each call to
# import_pickled_datasets(name, split) is unpacked into four module-level
# names: <prefix>_experts_<split>, <prefix>_tasks_<split>,
# <prefix>_costs_<split> and <prefix>_graphmat_<split>.

# IMDB: splits 1-3
(
    (imdb_experts_1, imdb_tasks_1, imdb_costs_1, imdb_graphmat_1),
    (imdb_experts_2, imdb_tasks_2, imdb_costs_2, imdb_graphmat_2),
    (imdb_experts_3, imdb_tasks_3, imdb_costs_3, imdb_graphmat_3),
) = [import_pickled_datasets('imdb', split) for split in (1, 2, 3)]

# Bibsonomy: splits 1-3
(
    (bbsm_experts_1, bbsm_tasks_1, bbsm_costs_1, bbsm_graphmat_1),
    (bbsm_experts_2, bbsm_tasks_2, bbsm_costs_2, bbsm_graphmat_2),
    (bbsm_experts_3, bbsm_tasks_3, bbsm_costs_3, bbsm_graphmat_3),
) = [import_pickled_datasets('bbsm', split) for split in (1, 2, 3)]

# Freelancer: splits 1-2
(
    (fl_experts_1, fl_tasks_1, fl_costs_1, fl_graphmat_1),
    (fl_experts_2, fl_tasks_2, fl_costs_2, fl_graphmat_2),
) = [import_pickled_datasets('freelancer', split) for split in (1, 2)]

### IMDB

In [None]:
def _budgetGrid(maxBudget, eps=0.1, min_cost=5):
    '''
    Build the list of budgets to evaluate: a (1 + eps) geometric grid that
    starts at min_cost (values rounded to 2 decimals) and always ends with
    maxBudget itself.
    '''
    # A cap at or below the starting cost leaves room for one budget only;
    # never hand the algorithms a budget larger than the cap.
    if maxBudget <= min_cost:
        return [maxBudget]

    cost_arr = [min_cost]
    while min_cost * (1 + eps) < maxBudget:
        min_cost = round(min_cost * (1 + eps), 2)
        cost_arr.append(min_cost)
    cost_arr.append(maxBudget)
    return cost_arr


def _plotTaskResults(cost_arr, coverages, costs, runtimes):
    '''
    Draw and show the two-panel figure for one task.

    Top panel: coverage vs. cost, one curve per algorithm. Bottom panel: total
    runtime per algorithm as a bar chart. All three dicts are keyed by
    algorithm name; the algorithm order is taken from ``coverages``.
    '''
    algo_names = list(coverages.keys())

    # Line style and fixed colour for the prefix-Pareto curves.
    prefix_styles = {
        "PrefixPareto-1Guess": ('o--', 'red'),
        "PrefixPareto-2Guess": ('^--', 'black'),
    }
    budget_sweep_algs = [alg for alg in algo_names if alg not in prefix_styles]

    # One colour per algorithm, shared by both panels. The colormap is sampled
    # once per budget-sweep algorithm; sizing it independently of the number
    # of algorithms would leave some algorithms without a colour.
    palette = cm.magma(np.linspace(0.01, 0.8, len(budget_sweep_algs)))
    alg_colors = {alg: tuple(rgba) for alg, rgba in zip(budget_sweep_algs, palette)}
    for alg, (_, fixed_color) in prefix_styles.items():
        alg_colors[alg] = fixed_color

    fig, axs = plt.subplots(2, 1, figsize=(10, 6))

    # Top: coverage vs. cost
    for alg in algo_names:
        if alg in prefix_styles:
            # Prefix-Pareto results carry their own cost values.
            line_style, _ = prefix_styles[alg]
            axs[0].plot(costs[alg], coverages[alg], line_style, alpha=0.6,
                        label=alg, color=alg_colors[alg])
        else:
            # Budget-sweep results are plotted against the budget grid and
            # cut off one point after the first maximum-coverage point.
            shown = int(np.argmax(coverages[alg])) + 2
            axs[0].plot(cost_arr[:shown], coverages[alg][:shown], '*--', alpha=0.5,
                        label=alg, color=alg_colors[alg])

    axs[0].set_title('Coverage vs. Cost')
    axs[0].set_ylabel("Task Coverage")
    axs[0].set_xlabel("Cost Budget")
    axs[0].grid(alpha=0.3)

    # Bottom: total runtimes per algorithm (bar chart)
    sums = []
    for alg in algo_names:
        times = runtimes.get(alg, [])
        arr = np.array(times, dtype=float) if len(times) > 0 else np.array([0.0])
        sums.append(float(np.nansum(arr)))

    x = np.arange(len(algo_names))
    bars = axs[1].bar(x, sums, width=0.3, color=[alg_colors[alg] for alg in algo_names])
    axs[1].set_xticks(x)
    axs[1].set_xticklabels(algo_names, ha='center', rotation=30)
    axs[1].set_title('Total Runtime per Algorithm')
    axs[1].set_ylabel('Total Runtime (s)')
    axs[1].set_xlabel('Algorithm')
    axs[1].grid(axis='y', alpha=0.3)

    # Annotate bars with values
    for b, v in zip(bars, sums):
        axs[1].text(b.get_x() + b.get_width() / 2, v, f"{v:.2f}", ha='center', va='bottom', fontsize=9)

    # Single legend from top subplot
    handles, labels = axs[0].get_legend_handles_labels()
    if handles:
        fig.legend(handles, labels, loc='upper center', ncol=max(3, len(handles)))

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()


def findApproximateParetoSolutions(tasks_list, experts_list, costs_list,
                                   sizeUniverse, numExperts, numTasks, maxBudget):
    '''
    Wrapper to run algorithms on different datasets.

    For each of the first ``numTasks`` tasks, runs plainGreedy, greedyPlus,
    twoGuessPlainGreedy and oneGuessGreedyPlus once per budget on a
    (1 + eps) grid capped at ``maxBudget``, runs the two prefix-Pareto
    algorithms once at ``maxBudget``, and shows one figure per task
    (coverage vs. cost on top, total runtime per algorithm below).

    Parameters:
        tasks_list:   indexable collection of tasks; entry ``task_index`` is
                      passed to paretoKnapsack as ``task``.
        experts_list: experts; only the first ``numExperts`` are used.
        costs_list:   passed to paretoKnapsack as ``costs``.
        sizeUniverse: passed to paretoKnapsack as ``size_univ``.
        numExperts:   number of leading experts to keep.
        numTasks:     number of leading tasks to process.
        maxBudget:    largest budget evaluated.

    Returns:
        None. Results are only plotted.
    '''
    sweep_algs = ("PlainGreedy", "GreedyPlus", "TwoGuessPlainGreedy", "OneGuessGreedyPlus")
    prefix_algs = ("PrefixPareto-1Guess", "PrefixPareto-2Guess")
    algo_names = sweep_algs + prefix_algs

    # The budget grid depends only on maxBudget, so build it once for all tasks.
    cost_arr = _budgetGrid(maxBudget)

    for task_index in range(numTasks):
        # Keep track of coverages, costs and runtimes per algorithm
        coverages = {alg: [] for alg in algo_names}
        costs = {alg: [] for alg in algo_names}
        runtimes = {alg: [] for alg in algo_names}

        for budgetVal in cost_arr:
            # Initialize Pareto teams object
            # NOTE(review): experts are truncated to numExperts but costs_list
            # is passed whole - confirm paretoKnapsack expects that.
            paretoTeams = paretoKnapsack(task=tasks_list[task_index],
                                         n_experts=experts_list[:numExperts],
                                         costs=costs_list, size_univ=sizeUniverse,
                                         budget=budgetVal)

            # Each algorithm returns a 5-tuple whose last three entries are
            # coverage, cost and runtime; the first two are not used here.
            sweep_runs = zip(sweep_algs, (paretoTeams.plainGreedy,
                                          paretoTeams.greedyPlus,
                                          paretoTeams.twoGuessPlainGreedy,
                                          paretoTeams.oneGuessGreedyPlus))
            for alg, run in sweep_runs:
                _, _, alg_cov, alg_cost, alg_time = run()
                coverages[alg].append(alg_cov)
                costs[alg].append(alg_cost)
                runtimes[alg].append(alg_time)

        # Compute Prefix Pareto Greedy 1 Guess and 2 Guess once, at the full
        # budget; each call returns its own cost and coverage sequences.
        paretoTeams = paretoKnapsack(task=tasks_list[task_index],
                                     n_experts=experts_list[:numExperts],
                                     costs=costs_list, size_univ=sizeUniverse,
                                     budget=maxBudget)
        prefix_runs = zip(prefix_algs, (paretoTeams.prefixParetoGreedy_1Guess,
                                        paretoTeams.prefixParetoGreedy_2Guess))
        for alg, run in prefix_runs:
            costs[alg], coverages[alg], _, prefix_time = run()
            runtimes[alg].append(prefix_time)

        # Plot performance and runtimes
        _plotTaskResults(cost_arr, coverages, costs, runtimes)

## IMDB-1

In [None]:
# IMDB, dataset 1: first 10 tasks, first 75 experts, budgets up to 70.
findApproximateParetoSolutions(
    tasks_list=imdb_tasks_1,
    experts_list=imdb_experts_1,
    costs_list=imdb_costs_1,
    sizeUniverse=24,
    numExperts=75,
    numTasks=10,
    maxBudget=70,
)

## Bbsm-2

In [None]:
# Bibsonomy, dataset 2 (the bbsm_*_2 variables): first 10 tasks,
# first 300 experts, budgets up to 60.
findApproximateParetoSolutions(
    tasks_list=bbsm_tasks_2,
    experts_list=bbsm_experts_2,
    costs_list=bbsm_costs_2,
    sizeUniverse=75,
    numExperts=300,
    numTasks=10,
    maxBudget=60,
)

## Freelancer-1

In [None]:
# Freelancer, dataset 1: first 10 tasks, first 150 experts, budgets up to 120.
findApproximateParetoSolutions(
    tasks_list=fl_tasks_1,
    experts_list=fl_experts_1,
    costs_list=fl_costs_1,
    sizeUniverse=50,
    numExperts=150,
    numTasks=10,
    maxBudget=120,
)

In [None]:
# TODO:
# - Separate GreedyParetoPrefix into its own function
# - Create a version for One-Guess as well
# - Set up code to run the GreedyParetoPrefix algorithm on any dataset
# - Figure out total runtime tracking
# - Summarize results in tables in Overleaf (different datasets and sizes)
# - Add new datasets