In [None]:
from paretoKnapsackInfluence import *
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# NetHEPT: raw data location, then load it.
# The loader's two return values are presumably (graph, node-cost mapping) —
# they are passed on as `G` / `node_costs` further down.
data_path_HEPT = '../../datasets/raw_data/influence/NetHEPT/hep.txt'
(G_HEPT, node_costs_HEPT) = import_influence_data(data_path_HEPT)

# NetPHY: same procedure for the second dataset.
data_path_PHY = '../../datasets/raw_data/influence/NetPHY/phy.txt'
(G_PHY, node_costs_PHY) = import_influence_data(data_path_PHY)

In [None]:
def _geometric_cost_grid(maxBudget, eps=0.1, min_cost=1):
    '''
    Build the increasing budget grid min_cost, min_cost*(1+eps), ..., maxBudget.
    Intermediate values are rounded to 2 decimals; the last entry is always maxBudget.
    If maxBudget <= min_cost the grid is the single value maxBudget, so no grid
    point ever exceeds the maximum budget.
    '''
    if maxBudget <= min_cost:
        return np.array([maxBudget], dtype=float)

    grid = [min_cost]
    current = min_cost
    while current * (1 + eps) < maxBudget:
        current = round(current * (1 + eps), 2)
        if current >= maxBudget:
            # Rounding pushed the value up to (or past) the cap; the cap itself
            # is appended below, so stop here to keep the grid strictly increasing.
            break
        grid.append(current)
    grid.append(maxBudget)
    return np.array(grid, dtype=float)


def _sample_graph_realizations(G, num_samples, rng):
    '''
    Draw num_samples random subgraphs of G: every edge is kept independently
    when a uniform [0, 1) draw is below its 'weight' attribute.
    Returns a list of (sampled graph, neighbor sets, node -> connected component) tuples.
    '''
    samples = []
    for _ in range(num_samples):
        G_sample = nx.Graph()
        neighbors = defaultdict(set)
        # No default factory: behaves like a plain dict (KeyError for nodes that
        # kept no edge in this sample). Kept as in the original for the consumer.
        connected_components = defaultdict()
        for u, v, data in G.edges(data=True):
            if rng.uniform(0, 1) < data['weight']:
                G_sample.add_edge(u, v)
                neighbors[u].add(v)
                neighbors[v].add(u)
        for component in nx.connected_components(G_sample):
            for node in component:
                connected_components[node] = component
        samples.append((G_sample, neighbors, connected_components))
    return samples


def _align_curve_to_cost_grid(costs, infls, cost_arr):
    '''
    Linearly interpolate a (cost, influence) curve onto cost_arr.
    Budgets outside the curve's cost range take the first / last influence value.
    An empty curve yields zeros.
    '''
    if len(costs) == 0:
        return np.zeros_like(cost_arr, dtype=float)
    costs = np.asarray(costs, dtype=float)
    infls = np.asarray(infls, dtype=float)
    # np.interp requires increasing x-coordinates; sort defensively
    # (a no-op when the curve is already ordered by cost).
    order = np.argsort(costs, kind='stable')
    costs, infls = costs[order], infls[order]
    return np.interp(cost_arr, costs, infls, left=infls[0], right=infls[-1])


def findApproximateParetoSolutionsKnapsackInfluence(G, node_costs, maxBudget, num_samples=35, num_runs=10, dataset_name="", seed=None):
    '''
    Run the knapsack-influence algorithms over multiple runs and plot
    mean influence vs. cost (shaded band = +/- 0.5 std across runs) and
    mean total runtime per algorithm (+/- std).
    Parameters:
    - G: Graph whose edges carry a 'weight' attribute used as the edge keep-probability
    - node_costs: Node costs dict
    - maxBudget: Maximum knapsack budget
    - num_samples: Number of Monte Carlo samples per run
    - num_runs: Number of independent runs (must be >= 1)
    - dataset_name: Name of the dataset for plotting
    - seed: Optional seed for the edge sampling done in this function. When None
      (default) the global np.random state is used, exactly as before.
    Returns:
    - mean_influences: dict algorithm name -> mean influence per entry of cost_arr
    - cost_arr: array of budgets that were evaluated
    - all_runtimes: dict algorithm name -> list with the total runtime of each run
    Note: algorithms listed in algo_names that are not executed keep an all-zero
    entry in the returned dicts, but are left out of both plots.
    '''
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Cost grid (same pattern as teams/restaurants notebooks)
    cost_arr = _geometric_cost_grid(maxBudget, eps=0.1, min_cost=1)

    # Algorithms re-run at every budget: (display name, method name on the solver)
    per_budget_algos = [
        ("PlainGreedy", "plainGreedy"),
        ("GreedyPlus", "greedyPlus"),
        ("TwoGuessPlainGreedy", "twoGuessPlainGreedy"),
        ("OneGuessGreedyPlus", "oneGuessGreedyPlus"),
    ]
    algo_names = ["PlainGreedy", "GreedyPlus", "TwoGuessPlainGreedy", "OneGuessGreedyPlus", "PrefixPareto-1Guess", "PrefixPareto-2Guess"]

    # Dedicated generator only when a seed is requested; otherwise the global
    # np.random state is used so existing callers see unchanged behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # containers across runs
    all_influences = {alg: [] for alg in algo_names}
    all_runtimes = {alg: [] for alg in algo_names}
    executed = set()  # algorithms that actually produced measurements

    for _run in range(num_runs):
        # Generate graph samples once to share across algorithms (per run)
        graph_samples = _sample_graph_realizations(G, num_samples, rng)

        run_influences = {alg: [] for alg in algo_names}
        run_runtimes = {alg: [] for alg in algo_names}

        # Run algorithms over budgets
        for budgetVal in cost_arr:
            pareto = paretoKnapsackInfluence(G=G,
                                             node_costs=node_costs,
                                             budget=budgetVal,
                                             num_samples=num_samples,
                                             graph_samples=graph_samples)
            for alg, method_name in per_budget_algos:
                _, infl, _, runTime = getattr(pareto, method_name)()
                run_influences[alg].append(infl)
                run_runtimes[alg].append(runTime)

        # Prefix Pareto (computed once per run at full budget)
        pareto_full = paretoKnapsackInfluence(G=G,
                                              node_costs=node_costs,
                                              budget=maxBudget,
                                              num_samples=num_samples,
                                              graph_samples=graph_samples)

        pp1_costs, pp1_influences, _, pp1_time = pareto_full.prefixParetoGreedy_1Guess()
        # Align the prefix pareto curve to cost_arr by interpolation
        run_influences['PrefixPareto-1Guess'] = list(_align_curve_to_cost_grid(pp1_costs, pp1_influences, cost_arr))
        run_runtimes['PrefixPareto-1Guess'].append(pp1_time)
        # NOTE: PrefixPareto-2Guess is not executed (its call was disabled in this
        # notebook). To enable it, call pareto_full.prefixParetoGreedy_2Guess() and
        # fill run_influences / run_runtimes the same way as for the 1-Guess variant.

        # store per-run arrays and total runtimes
        for alg in algo_names:
            arr = np.array(run_influences[alg], dtype=float)
            if arr.size == 0:
                arr = np.zeros_like(cost_arr, dtype=float)
            all_influences[alg].append(arr)

            runtimes = run_runtimes[alg]
            if runtimes:
                executed.add(alg)
                total_runtime = float(np.nansum(np.array(runtimes, dtype=float)))
            else:
                total_runtime = 0.0
            all_runtimes[alg].append(total_runtime)

    # compute mean and (scaled) std across runs for each algorithm
    band_scale = 0.5  # width of the shaded band in units of std
    mean_influences = {}
    std_influences = {}
    for alg in algo_names:
        stacked = np.vstack(all_influences[alg])  # shape (num_runs, len(cost_arr))
        mean_influences[alg] = np.mean(stacked, axis=0)
        std_influences[alg] = np.std(stacked, axis=0) * band_scale

    # Plot mean influence with shaded std band
    colors = cm.magma(np.linspace(0.01, 0.8, len(algo_names)))
    linestyles = ['-', '--', '-.', ':', (0, (3,1,1,1)), (0, (1,1))]
    markers = ['o', 's', '^', 'v', 'D', 'X']

    plot_order = [a for a in algo_names if not a.startswith('PrefixPareto')] + \
                 [a for a in algo_names if a.startswith('PrefixPareto')]

    fig, ax = plt.subplots(figsize=(8, 5))
    for i, alg in enumerate(plot_order):
        if alg not in executed:
            # No measurements: do not draw a fabricated all-zero curve.
            continue
        mean = mean_influences[alg]
        std = std_influences[alg]
        color = colors[algo_names.index(alg)]

        ax.plot(cost_arr, mean,
                label=alg,
                color=color,
                linestyle=linestyles[i % len(linestyles)],
                marker=markers[i % len(markers)],
                markersize=5,
                markeredgewidth=0.8,
                markeredgecolor='k',
                linewidth=1.2,
                zorder=3)
        ax.fill_between(cost_arr,
                        np.clip(mean - std, 0, None),
                        mean + std,
                        color=color,
                        alpha=0.18,
                        zorder=2)

    ax.set_title(f'Mean Influence vs. Cost ({dataset_name}) (shaded = ±{band_scale} std)')
    ax.set_ylabel("Mean Influence")
    ax.set_xlabel("Cost Budget")
    ax.grid(alpha=0.3)
    ax.legend(fontsize=8, ncol=2)
    plt.show()

    # Plot total runtime per algorithm with mean +/- std bars (executed algorithms only)
    timed_algos = [alg for alg in algo_names if alg in executed]
    means_rt = [np.mean(all_runtimes[alg]) for alg in timed_algos]
    stds_rt = [np.std(all_runtimes[alg]) for alg in timed_algos]
    x = np.arange(len(timed_algos))
    fig2, ax2 = plt.subplots(figsize=(8, 3))
    ax2.bar(x, means_rt, yerr=stds_rt, capsize=5,
            color=[colors[algo_names.index(alg)] for alg in timed_algos])
    ax2.set_xticks(x)
    ax2.set_xticklabels(timed_algos, rotation=30, ha='right', fontsize=8)
    ax2.set_ylabel('Total Runtime (s)')
    ax2.set_title('Mean Total Runtime per Algorithm (±std)')
    ax2.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    return mean_influences, cost_arr, all_runtimes

### NetHEPT Experiments

In [None]:
# Experiment settings used by the dataset runs in the cells below
num_runs = 5       # independent repetitions per dataset
num_samples = 10   # Monte Carlo graph samples drawn in each repetition
maxBudget = 6      # largest knapsack budget on the cost grid


In [None]:

# NetHEPT experiment: evaluate all algorithms, show the plots, keep the summaries
influences_HEPT, costs_HEPT, runtimes_HEPT = findApproximateParetoSolutionsKnapsackInfluence(
    G=G_HEPT,
    node_costs=node_costs_HEPT,
    maxBudget=maxBudget,
    num_samples=num_samples,
    num_runs=num_runs,
    dataset_name="NetHEPT",
)

### NetPHY Experiments

In [None]:
# NetPHY experiment: same settings as NetHEPT, applied to the NetPHY graph
influences_PHY, costs_PHY, runtimes_PHY = findApproximateParetoSolutionsKnapsackInfluence(
    G=G_PHY,
    node_costs=node_costs_PHY,
    maxBudget=maxBudget,
    num_samples=num_samples,
    num_runs=num_runs,
    dataset_name="NetPHY",
)