In [None]:
from paretoCardinalityRestaurants import *

In [None]:
#Function to sample numItems items (and their pairwise similarities) from the dataset
def sample_dataset(simMatrix, item_ids, numItems, rng=None):
    '''
    Samples a random subset of items, without replacement, from the dataset.

    Args:
        simMatrix (np.ndarray): Similarity matrix of shape (N, N).
        item_ids (list): List of item IDs of length N.
        numItems (int): Number of items to sample (0 <= numItems <= N).
        rng (None | int | np.random.Generator): Source of randomness.
            With the default None, NumPy's legacy global state is used, so an
            earlier np.random.seed(...) call still controls the draw.
            Passing an int seed or a Generator makes this call reproducible
            on its own, independent of the global state.
    Returns:
        sampled_simMatrix (np.ndarray): Sampled similarity matrix of shape (numItems, numItems).
        sampled_item_ids (list): List of sampled item IDs of length numItems,
            in the same order as the rows/columns of sampled_simMatrix.
    Raises:
        ValueError: If simMatrix is not (N, N) for N = len(item_ids), or if
            numItems is negative or larger than N.
    '''
    num_available = len(item_ids)

    # A matrix larger than the ID list would otherwise be indexed without
    # error and return similarities that do not belong to the sampled IDs.
    matrix_shape = tuple(np.shape(simMatrix))
    if matrix_shape != (num_available, num_available):
        raise ValueError(
            f"simMatrix has shape {matrix_shape}, expected "
            f"({num_available}, {num_available}) to match len(item_ids)"
        )
    if not 0 <= numItems <= num_available:
        raise ValueError(
            f"numItems must be between 0 and {num_available}, got {numItems}"
        )

    # Keep the global-state path as the default so existing callers that seed
    # with np.random.seed() get exactly the same draws as before.
    if rng is None:
        choose = np.random.choice
    else:
        choose = np.random.default_rng(rng).choice

    sampled_indices = choose(num_available, size=numItems, replace=False)

    # np.ix_ builds the open mesh that selects the same rows and columns,
    # i.e. the principal sub-matrix of the sampled items.
    sampled_simMatrix = simMatrix[np.ix_(sampled_indices, sampled_indices)]
    sampled_item_ids = [item_ids[i] for i in sampled_indices]

    return sampled_simMatrix, sampled_item_ids


def _coverage_by_cardinality(kSolDict, k_max, carry_forward):
    '''Return the list of kSolDict[k]['Coverage'] values for k = 1..k_max.'''
    if not carry_forward:
        # Strict lookup: a missing k raises KeyError.
        return [kSolDict[k]['Coverage'] for k in range(1, k_max + 1)]

    # Carry-forward lookup: a k that is absent from kSolDict reuses the value
    # of the last k that was present (0 before the first one).
    objectives = []
    current_objective = 0
    for k in range(1, k_max + 1):
        if k in kSolDict:
            current_objective = kSolDict[k]['Coverage']
        objectives.append(current_objective)
    return objectives


def findApproximateParetoSolutions(simMatrix, item_ids, sample_size, numSamples, k_max,
                                   std_band_scale=0.5):
    '''
    Run the algorithms over multiple random samples of the dataset, aggregate
    the results, and plot them.

    Two figures are shown:
      1. Mean objective per cardinality k for each algorithm, with a shaded
         band of +/- std_band_scale * std across samples (lower edge clipped at 0).
      2. Mean runtime per algorithm with +/- 1 std error bars.

    Args:
        simMatrix (np.ndarray): Similarity matrix of shape (N, N).
        item_ids (list): List of item IDs of length N.
        sample_size (int): Number of items to sample from the dataset.
        numSamples (int): Number of random samples to run (must be >= 1).
        k_max (int): Maximum cardinality.
        std_band_scale (float): Multiple of the across-sample standard
            deviation used for the shaded band in the objective plot.
    Returns:
        None. The function only prints progress and shows the two figures.
    Raises:
        ValueError: If numSamples is less than 1 (there is nothing to aggregate).
    '''
    if numSamples < 1:
        raise ValueError(f"numSamples must be at least 1, got {numSamples}")

    # (display name, solver method to call, carry coverage forward over missing k?)
    # Only ParetoGreedy tolerates missing cardinalities in kSolDict; TopK and
    # Random are read strictly, so a missing k there still raises KeyError.
    algorithms = [
        ("ParetoGreedy", "greedyCardinality", True),
        ("TopK", "top_k", False),
        ("Random", "random_selection", False),
    ]
    algo_names = [name for name, _, _ in algorithms]

    # containers across samples
    all_objectives = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}

    print(f"Starting processing {numSamples} samples with sample size {sample_size} and k_max {k_max}")

    # iterate samples
    for sample_index in range(numSamples):
        print(f"Processing sample {sample_index + 1}/{numSamples}")

        # sample the dataset
        sampled_simMatrix, sampled_item_ids = sample_dataset(simMatrix, item_ids, sample_size)

        for alg, method_name, carry_forward in algorithms:
            # A fresh solver per algorithm, so each one fills its own kSolDict.
            # NOTE(review): n_items receives the list of sampled IDs, not a
            # count - confirm this matches what the constructor expects.
            solver = paretoCardinalityRestaurants(n_items=sampled_item_ids,
                                                  simMatrix=sampled_simMatrix,
                                                  k_max=k_max)
            # Only the fourth returned value is used; it is treated as the runtime.
            _, _, _, runTime = getattr(solver, method_name)()

            objectives = _coverage_by_cardinality(solver.kSolDict, k_max, carry_forward)
            all_objectives[alg].append(np.array(objectives, dtype=float))
            all_runtimes[alg].append(runTime)

    # compute mean and scaled std across samples for each algorithm
    mean_objectives = {}
    std_objectives = {}
    for alg in algo_names:
        stacked = np.vstack(all_objectives[alg])  # shape (numSamples, k_max)
        mean_objectives[alg] = np.mean(stacked, axis=0)
        std_objectives[alg] = np.std(stacked, axis=0) * std_band_scale

    # Plot mean objective with shaded std band
    colors = cm.magma(np.linspace(0.01, 0.8, len(algo_names)))
    linestyles = ['-', '--', ':']
    markers = ['o', 's', '^']
    cardinalities = range(1, k_max + 1)

    fig, ax = plt.subplots(figsize=(8, 5))
    for i, alg in enumerate(algo_names):
        mean = mean_objectives[alg]
        std = std_objectives[alg]

        ax.plot(cardinalities, mean,
                label=alg,
                color=colors[i],
                linestyle=linestyles[i],
                marker=markers[i],
                markersize=5,
                markeredgewidth=0.8,
                markeredgecolor='k',
                linewidth=1.2,
                zorder=3)
        # The lower edge is clipped at 0 so the band never goes negative.
        ax.fill_between(cardinalities,
                        np.clip(mean - std, 0, None),
                        mean + std,
                        color=colors[i],
                        alpha=0.18,
                        zorder=2)

    ax.set_xlabel('Cardinality (k)')
    ax.set_ylabel('Mean Objective')
    # The title is derived from the same factor used for the band, so the two cannot drift apart.
    ax.set_title(f'Mean Objective across samples (shaded = ±{std_band_scale:g} std)')
    ax.grid(alpha=0.3)
    ax.legend(fontsize=8)
    plt.show()

    # Plot total runtime per algorithm with mean +/- std bars
    means_rt = [np.mean(all_runtimes[alg]) for alg in algo_names]
    stds_rt = [np.std(all_runtimes[alg]) for alg in algo_names]
    x = np.arange(len(algo_names))
    fig2, ax2 = plt.subplots(figsize=(8, 3))
    ax2.bar(x, means_rt, yerr=stds_rt, capsize=5, color=list(colors))
    ax2.set_xticks(x)
    ax2.set_xticklabels(algo_names, rotation=30, ha='right', fontsize=8)
    ax2.set_ylabel('Total Runtime (s)')
    ax2.set_title('Mean Total Runtime per Algorithm (±std)')
    ax2.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

### Yelp Phoenix Experiments

In [None]:
#Load the pickled Yelp Phoenix dataset: item IDs and their similarity matrix
#NOTE(review): pickle.load can execute code embedded in the file - only load trusted pickles
data_path = '../../datasets/pickled_data/yelp/yelp_phoenix_'

#Item IDs
with open(data_path + 'ids.pkl', "rb") as ids_file:
    phoenix_ids = pickle.load(ids_file)

#Similarity matrix
with open(data_path + 'sim.pkl', "rb") as sim_file:
    phoenix_simMatrix = pickle.load(sim_file)

In [None]:
#Experiment parameters - adjust as needed
sample_size = 500  #number of items drawn in each random sample
numSamples = 20  #number of random samples
kmax = 50  #maximum cardinality
random_seed = 42  #seed for NumPy's global random state

#Seed NumPy's global RNG so the random dataset samples are the same on every run.
#NOTE(review): if random_selection() draws from a different RNG (e.g. the
#stdlib random module), that source needs its own seed - confirm in the class.
np.random.seed(random_seed)

findApproximateParetoSolutions(simMatrix=phoenix_simMatrix,
                               item_ids=phoenix_ids,
                               sample_size=sample_size,
                               numSamples=numSamples,
                               k_max=kmax)

### Yelp Vegas Experiments

In [None]:
#Load the pickled Yelp Vegas dataset: item IDs and their similarity matrix
#NOTE(review): pickle.load can execute code embedded in the file - only load trusted pickles
data_path = '../../datasets/pickled_data/yelp/yelp_vegas_'

#Item IDs
with open(data_path + 'ids.pkl', "rb") as ids_file:
    vegas_ids = pickle.load(ids_file)

#Similarity matrix
with open(data_path + 'sim.pkl', "rb") as sim_file:
    vegas_simMatrix = pickle.load(sim_file)

In [None]:
#Experiment parameters - adjust as needed
sample_size = 500  #number of items drawn in each random sample
numSamples = 20  #number of random samples
kmax = 50  #maximum cardinality
random_seed = 42  #seed for NumPy's global random state

#Seed NumPy's global RNG so the random dataset samples are the same on every run.
#NOTE(review): if random_selection() draws from a different RNG (e.g. the
#stdlib random module), that source needs its own seed - confirm in the class.
np.random.seed(random_seed)

findApproximateParetoSolutions(simMatrix=vegas_simMatrix,
                               item_ids=vegas_ids,
                               sample_size=sample_size,
                               numSamples=numSamples,
                               k_max=kmax)