In [None]:
from paretoKnapsackRestaurants import *

In [None]:
#function to sample numItems from the dataset
def sample_dataset(simMatrix, item_ids, item_costs, numItems, rng=None):
    '''
    Draws a random subset of items (without replacement) and restricts the
    similarity matrix, the IDs and the costs to that subset.
    Args:
        simMatrix (np.ndarray): Similarity matrix of shape (N, N).
        item_ids (list): List of item IDs of length N.
        item_costs (list): List of item costs of length N.
        numItems (int): Number of items to sample (0 <= numItems <= N).
        rng (None | int | np.random.Generator): Optional seed or generator for
            reproducible sampling. When None (default) NumPy's global random
            state is used, so existing callers behave exactly as before.
    Returns:
        sampled_simMatrix (np.ndarray): Sampled similarity matrix of shape (numItems, numItems).
        sampled_item_ids (list): List of sampled item IDs of length numItems.
        sampled_item_costs (list): List of sampled item costs of length numItems.
    Raises:
        ValueError: If numItems is negative or larger than the number of items.
    '''
    num_available = len(item_ids)
    if not 0 <= numItems <= num_available:
        # np.random.choice would also raise ValueError here, but with a message
        # that does not mention the dataset size.
        raise ValueError(
            f"numItems must be between 0 and {num_available}, got {numItems}"
        )

    if rng is None:
        # Legacy path: honours np.random.seed(...) set by the caller.
        sampled_indices = np.random.choice(num_available, size=numItems, replace=False)
    else:
        # default_rng accepts an int seed or passes an existing Generator through.
        sampled_indices = np.random.default_rng(rng).choice(
            num_available, size=numItems, replace=False
        )

    # np.ix_ selects the same indices on rows and columns, keeping the matrix square.
    sampled_simMatrix = simMatrix[np.ix_(sampled_indices, sampled_indices)]
    sampled_item_ids = [item_ids[i] for i in sampled_indices]
    sampled_item_costs = [item_costs[i] for i in sampled_indices]

    return sampled_simMatrix, sampled_item_ids, sampled_item_costs

In [None]:
# Disabled example: run on a single random sample (needs the Phoenix data loaded in a later cell)
# numItems = 100 
# phoenix_simMatrix_sample, phoenix_ids_sample, phoenix_costs_sample = sample_dataset(phoenix_simMatrix, phoenix_ids, phoenix_costs, numItems)

# paretoPhoenix = paretoKnapsackRestaurants(n_items=phoenix_ids_sample,
#                                           costs=phoenix_costs_sample,
#                                           simMatrix=phoenix_simMatrix_sample,
#                                           budget=50)

### Yelp - Average plotting across different random samples

In [None]:
def _build_cost_grid(maxBudget, eps=0.1, min_cost=1):
    '''
    Build the geometric budget grid min_cost, min_cost*(1+eps), ... ending at maxBudget.
    Intermediate values are rounded to 2 decimals. The grid never exceeds
    maxBudget and never contains maxBudget twice.
    '''
    import numpy as np

    if maxBudget <= min_cost:
        # Starting at min_cost would exceed (or duplicate) the maximum budget.
        return np.array([maxBudget], dtype=float)

    grid = [min_cost]
    current = min_cost
    while current * (1 + eps) < maxBudget:
        # Rounding can push the value up to/over maxBudget, so cap it.
        nxt = min(round(current * (1 + eps), 2), maxBudget)
        if nxt <= current:
            # Rounding swallowed the step; stop instead of looping forever.
            break
        current = nxt
        grid.append(current)
    if grid[-1] < maxBudget:
        grid.append(maxBudget)
    return np.array(grid, dtype=float)


def _align_to_cost_grid(cost_grid, costs, objs):
    '''
    Linearly interpolate (cost, objective) points onto cost_grid; zeros if there are no points.
    '''
    import numpy as np

    cost_grid = np.asarray(cost_grid, dtype=float)
    if len(costs) == 0:
        return np.zeros_like(cost_grid, dtype=float)
    costs = np.asarray(costs, dtype=float)
    objs = np.asarray(objs, dtype=float)
    # np.interp requires increasing x-coordinates; sort the points by cost first
    # (a no-op when they are already sorted).
    order = np.argsort(costs, kind='stable')
    costs, objs = costs[order], objs[order]
    # Outside the observed cost range, hold the first / last objective value.
    return np.interp(cost_grid, costs, objs, left=objs[0], right=objs[-1])


def findApproximateParetoSolutions(simMatrix, item_ids, item_costs, sample_size, numSamples, maxBudget):
    '''
    Run algorithms over multiple random samples of the dataset, aggregate results, and plot
    the mean objective per budget (shaded band = +/- 0.5 std) and the mean total runtime
    per algorithm (error bars = +/- std).

    The four greedy baselines are re-run for every budget on a geometric cost grid;
    the two PrefixPareto variants are run once per sample at maxBudget and their
    (cost, objective) points are interpolated onto the same grid.

    Sampling goes through sample_dataset, which draws from NumPy's global random
    state; call np.random.seed(...) beforehand for reproducible figures.

    Args:
        simMatrix (np.ndarray): Similarity matrix of shape (N, N).
        item_ids (list): List of item IDs of length N.
        item_costs (list): List of item costs of length N.
        sample_size (int): Number of items to sample from the dataset.
        numSamples (int): Number of random samples to run (must be >= 1).
        maxBudget (float): Maximum budget for the knapsack problem.
    Returns:
        None. Two matplotlib figures are shown as a side effect.
    Raises:
        ValueError: If numSamples is smaller than 1.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    if numSamples < 1:
        # Without samples there is nothing to stack or average.
        raise ValueError(f"numSamples must be at least 1, got {numSamples}")

    # Cost grid (same for all samples)
    cost_arr = _build_cost_grid(maxBudget)

    # Baselines: display name -> method name on paretoKnapsackRestaurants.
    # Dict order fixes both the call order per budget and the plot colours.
    baseline_methods = {
        "PlainGreedy": "plainGreedy",
        "GreedyPlus": "greedyPlus",
        "TwoGuessPlainGreedy": "twoGuessPlainGreedy",
        "OneGuessGreedyPlus": "oneGuessGreedyPlus",
    }
    prefix_algos = ["PrefixPareto-1Guess", "PrefixPareto-2Guess"]
    algo_names = list(baseline_methods) + prefix_algos

    # containers across samples
    all_objectives = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}

    print(f"Starting processing {numSamples} samples with sample size {sample_size} and max budget {maxBudget}")

    # iterate samples
    for sample_index in range(numSamples):
        print(f"Processing sample {sample_index + 1}/{numSamples}")

        # per-sample containers (baselines are appended to across budgets)
        sample_objectives = {alg: [] for alg in baseline_methods}
        sample_runtimes = {alg: [] for alg in baseline_methods}

        # sample the dataset
        sampled_simMatrix, sampled_item_ids, sampled_item_costs = sample_dataset(
            simMatrix, item_ids, item_costs, sample_size)

        for budgetVal in cost_arr:
            # Initialize Pareto restaurants object for this budget
            paretoRest = paretoKnapsackRestaurants(n_items=sampled_item_ids,
                                                   costs=sampled_item_costs,
                                                   simMatrix=sampled_simMatrix,
                                                   budget=budgetVal)

            for alg, method_name in baseline_methods.items():
                # Each baseline returns a 4-tuple; only objective and runtime are aggregated.
                _, curr_objective, _, runTime = getattr(paretoRest, method_name)()
                sample_objectives[alg].append(curr_objective)
                sample_runtimes[alg].append(runTime)

        # Prefix Pareto (computed once per sample at full budget)
        print(f"  Running PrefixPareto algorithms for sample {sample_index + 1}")
        paretoRest_full = paretoKnapsackRestaurants(n_items=sampled_item_ids,
                                                    costs=sampled_item_costs,
                                                    simMatrix=sampled_simMatrix,
                                                    budget=maxBudget)
        pp1_costs, pp1_objectives, _, pp1_time = paretoRest_full.prefixParetoGreedy_1Guess()
        pp2_costs, pp2_objectives, _, pp2_time = paretoRest_full.prefixParetoGreedy_2Guess()

        # Bring the prefix-Pareto curves onto the shared cost grid by interpolating over cost.
        sample_objectives['PrefixPareto-1Guess'] = _align_to_cost_grid(cost_arr, pp1_costs, pp1_objectives)
        sample_objectives['PrefixPareto-2Guess'] = _align_to_cost_grid(cost_arr, pp2_costs, pp2_objectives)
        sample_runtimes['PrefixPareto-1Guess'] = [pp1_time]
        sample_runtimes['PrefixPareto-2Guess'] = [pp2_time]

        # # Min-max normalize objectives per algorithm for comparability across samples
        # for alg in algo_names:
        #     objs = sample_objectives[alg]
        #     if objs:
        #         min_obj = min(objs)
        #         max_obj = max(objs)
        #         if max_obj > min_obj:
        #             sample_objectives[alg] = [(obj - min_obj) / (max_obj - min_obj) for obj in objs]
        #         else:
        #             sample_objectives[alg] = [0.0 for _ in objs]  # All same, set to 0

        for alg in algo_names:
            # one objective curve (length len(cost_arr)) per sample and algorithm
            all_objectives[alg].append(np.asarray(sample_objectives[alg], dtype=float))
            # total runtime per sample (sum over budgets, or the single value for prefix)
            all_runtimes[alg].append(
                float(np.nansum(np.asarray(sample_runtimes[alg], dtype=float))))

        print(f"Completed sample {sample_index + 1}")

    print("All samples processed, computing statistics and plotting")

    # compute mean and (half) std across samples for each algorithm
    mean_objectives = {}
    std_objectives = {}
    for alg in algo_names:
        stacked = np.vstack(all_objectives[alg])  # shape (numSamples, len(cost_arr))
        mean_objectives[alg] = np.mean(stacked, axis=0)
        # The band is deliberately half a standard deviation wide (see plot title).
        std_objectives[alg] = np.std(stacked, axis=0) * 0.5

    # Per-algorithm plot styling: dashed + small markers for baselines,
    # solid + larger markers for PrefixPareto.
    colors = cm.magma(np.linspace(0.01, 0.8, len(algo_names)))
    linestyles = ['--', '--', '--', '--', '-', '-']
    markers = ['o', 's', '^', 'v', 'D', 'X']
    markersizes = [3, 3, 3, 3, 5, 5]
    styles = {
        alg: dict(color=colors[i], linestyle=linestyles[i],
                  marker=markers[i], markersize=markersizes[i])
        for i, alg in enumerate(algo_names)
    }

    # Plot mean objective with shaded std band.
    # algo_names lists the baselines first, so PrefixPareto lines are drawn last (on top).
    fig, ax = plt.subplots(figsize=(8, 5))
    for alg in algo_names:
        mean = mean_objectives[alg]
        std = std_objectives[alg]
        style = styles[alg]

        ax.plot(cost_arr, mean,
                label=alg,
                markeredgewidth=0.8,
                markeredgecolor='k',
                linewidth=1.2,
                zorder=3,
                **style)
        ax.fill_between(cost_arr,
                        np.clip(mean - std, 0, None),  # assuming objectives >=0
                        mean + std,
                        color=style['color'],
                        alpha=0.18,
                        zorder=2)

    ax.set_xlabel('Cost')
    ax.set_ylabel('Mean Objective Value')
    ax.set_title('Mean Objective across samples (shaded = ±0.5 std)')
    ax.grid(alpha=0.3)
    ax.legend(fontsize=8, ncol=2)
    plt.show()

    # Plot total runtime per algorithm with mean +/- std bars
    means_rt = [np.mean(all_runtimes[alg]) for alg in algo_names]
    stds_rt = [np.std(all_runtimes[alg]) for alg in algo_names]
    x = np.arange(len(algo_names))
    fig2, ax2 = plt.subplots(figsize=(8, 3))
    ax2.bar(x, means_rt, yerr=stds_rt, capsize=5, color=list(colors))
    ax2.set_xticks(x)
    ax2.set_xticklabels(algo_names, rotation=30, ha='right', fontsize=8)
    ax2.set_ylabel('Total Runtime (s)')
    ax2.set_title('Mean Total Runtime per Algorithm (±std)')
    ax2.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("Plotting completed")

    return None

### Yelp Phoenix Experiments

In [None]:
# Load the pickled Yelp Phoenix dataset: item IDs, similarity matrix and item costs.
import pickle  # explicit import so this cell does not depend on the star import re-exporting it

data_path = 'datasets/pickled_data/yelp/yelp_phoenix_'

# NOTE(security): pickle.load can execute arbitrary code from the file;
# only load pickles that were produced by a trusted source.
with open(data_path + 'ids.pkl', "rb") as fp:
    phoenix_ids = pickle.load(fp)

with open(data_path + 'sim.pkl', "rb") as fp:
    phoenix_simMatrix = pickle.load(fp)

with open(data_path + 'costs.pkl', "rb") as fp:
    phoenix_costs = pickle.load(fp)


In [None]:
# Run the averaged comparison on the Phoenix dataset (adjust parameters as needed)
RANDOM_SEED = 42  # fixed seed so the random samples, and hence the figures, are reproducible
sample_size = 50  # number of items drawn per random sample
numSamples = 5    # number of random samples to average over
maxBudget = 30    # largest knapsack budget on the cost grid

# sample_dataset draws from NumPy's global random state, so seed it right before the run.
np.random.seed(RANDOM_SEED)
findApproximateParetoSolutions(phoenix_simMatrix, phoenix_ids, phoenix_costs, sample_size, numSamples, maxBudget)

### Yelp Vegas Experiments

In [None]:
# Load the pickled Yelp Vegas dataset: item IDs, similarity matrix and item costs.
import pickle  # explicit import so this cell does not depend on the star import re-exporting it

data_path = 'datasets/pickled_data/yelp/yelp_vegas_'

# NOTE(security): pickle.load can execute arbitrary code from the file;
# only load pickles that were produced by a trusted source.
with open(data_path + 'ids.pkl', "rb") as fp:
    vegas_ids = pickle.load(fp)

with open(data_path + 'sim.pkl', "rb") as fp:
    vegas_simMatrix = pickle.load(fp)

with open(data_path + 'costs.pkl', "rb") as fp:
    vegas_costs = pickle.load(fp)

In [None]:
# Run the averaged comparison on the Vegas dataset (adjust parameters as needed)
RANDOM_SEED = 42  # fixed seed so the random samples, and hence the figures, are reproducible
sample_size = 50  # number of items drawn per random sample
numSamples = 5    # number of random samples to average over
maxBudget = 30    # largest knapsack budget on the cost grid

# sample_dataset draws from NumPy's global random state, so seed it right before the run.
np.random.seed(RANDOM_SEED)
findApproximateParetoSolutions(vegas_simMatrix, vegas_ids, vegas_costs, sample_size, numSamples, maxBudget)