In [1]:
import os

from seml import get_results
import matplotlib.pyplot as plt
import torch
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from dp_timeseries.privacy.pld import (DoubleMixtureGaussianPrivacyLoss, PerfectPrivacyLoss,
                  SwitchingPrivacyLoss, WeightedSumPrivacyLoss)

from dp_accounting.pld.privacy_loss_distribution import (
    PrivacyLossDistribution, _create_pld_pmf_from_additive_noise)

from dp_accounting.pld.privacy_loss_mechanism import (
    AdditiveNoisePrivacyLoss, AdjacencyType, GaussianPrivacyLoss,
    MixtureGaussianPrivacyLoss)

from scipy.stats import binom, hypergeom

from itertools import product

import numpy as np

In [2]:
# Log-spaced epsilon grid (1001 points from 1e-3 to 1e3) on which delta(epsilon) is evaluated.
epsilons = np.logspace(start=-3, stop=3, num=1001)

In [3]:
def get_deltas_bilevel(epsilons, num_bad: int, num_total: int, batch_size: int, num_compositions: int):
    """Compute delta(epsilon) for the 'bilevel' mode under self-composition.

    The per-step mechanism is a two-component Gaussian mixture with noise
    multiplier 1.0: sensitivity 0 with probability ``1 - p`` and sensitivity 2
    with probability ``p``, where ``p = min(batch_size * num_bad / num_total, 1)``.

    Args:
        epsilons: Epsilon value(s) passed to ``get_delta_for_epsilon``.
        num_bad: Multiplier on the per-step sampling probability.
        num_total: Total population size (denominator of the sampling probability).
        batch_size: Batch size (numerator of the sampling probability).
        num_compositions: Number of mechanism applications to compose.
            Values <= 1 return the uncomposed, single-step result.

    Returns:
        Whatever ``PrivacyLossDistribution.get_delta_for_epsilon(epsilons)``
        returns (presumably one delta per epsilon -- confirm against dp_accounting).
    """
    noise_multiplier = 1.0
    sensitivities = np.array([0.0, 2.0])
    sampling_prob = min(batch_size * num_bad / num_total, 1.0)
    sampling_probs = np.array([1 - sampling_prob, sampling_prob])

    # NOTE(review): SwitchingPrivacyLoss is project-local; presumably it uses the
    # ADD-adjacency loss below epsilon_threshold and the REMOVE-adjacency loss
    # above it -- confirm against dp_timeseries.privacy.pld.
    add_loss = MixtureGaussianPrivacyLoss(
        noise_multiplier,
        sensitivities,
        sampling_probs,
        adjacency_type=AdjacencyType.ADD
    )
    remove_loss = MixtureGaussianPrivacyLoss(
        noise_multiplier,
        sensitivities,
        sampling_probs,
        adjacency_type=AdjacencyType.REMOVE
    )
    privacy_loss = SwitchingPrivacyLoss(
        epsilon_threshold=0.0,
        below_threshold_pl=add_loss,
        above_threshold_pl=remove_loss
    )

    print('-----')
    print(sensitivities, sampling_probs)
    print('-----')

    pld_pmf = _create_pld_pmf_from_additive_noise(
        privacy_loss,
        value_discretization_interval=1e-3,
        use_connect_dots=True
    )

    pld = PrivacyLossDistribution(pld_pmf)

    # self_compose(k) yields the k-fold composition (k == 1 is the PLD itself),
    # so the requested count is passed directly; passing num_compositions - 1
    # would drop one application of the mechanism.
    if num_compositions > 1:
        pld_composed = pld.self_compose(num_compositions)
    else:
        pld_composed = pld

    return pld_composed.get_delta_for_epsilon(epsilons)

In [4]:
def get_deltas_wor(epsilons, num_bad: int, num_total: int, batch_size: int, num_compositions: int):
    """Compute delta(epsilon) for the 'wor' mode under self-composition.

    The per-step mechanism is a Gaussian mixture with noise multiplier 1.0 whose
    components have sensitivities ``0, 2, ..., 2 * num_bad``. Component ``k`` is
    weighted by the hypergeometric probability of drawing ``k`` of the
    ``num_bad`` marked items in ``batch_size`` draws without replacement from a
    population of ``num_total``.

    Args:
        epsilons: Epsilon value(s) passed to ``get_delta_for_epsilon``.
        num_bad: Number of marked items in the population.
        num_total: Population size.
        batch_size: Number of draws per step.
        num_compositions: Number of mechanism applications to compose.
            Values <= 1 return the uncomposed, single-step result.

    Returns:
        Whatever ``PrivacyLossDistribution.get_delta_for_epsilon(epsilons)``
        returns (presumably one delta per epsilon -- confirm against dp_accounting).
    """
    noise_multiplier = 1.0

    # Possible counts of marked items in a batch and their probabilities.
    counts = np.arange(num_bad + 1)
    sampling_probs = hypergeom.pmf(counts, num_total, num_bad, batch_size)
    assert np.isclose(sampling_probs.sum(), 1.0)
    # Each marked item in the batch contributes a sensitivity of 2.
    sensitivities = 2 * counts

    # NOTE(review): SwitchingPrivacyLoss is project-local; presumably it uses the
    # ADD-adjacency loss below epsilon_threshold and the REMOVE-adjacency loss
    # above it -- confirm against dp_timeseries.privacy.pld.
    add_loss = MixtureGaussianPrivacyLoss(
        noise_multiplier,
        sensitivities,
        sampling_probs,
        adjacency_type=AdjacencyType.ADD
    )
    remove_loss = MixtureGaussianPrivacyLoss(
        noise_multiplier,
        sensitivities,
        sampling_probs,
        adjacency_type=AdjacencyType.REMOVE
    )
    privacy_loss = SwitchingPrivacyLoss(
        epsilon_threshold=0.0,
        below_threshold_pl=add_loss,
        above_threshold_pl=remove_loss
    )

    print('-----')
    print(sensitivities, sampling_probs)
    print('-----')

    pld_pmf = _create_pld_pmf_from_additive_noise(
        privacy_loss,
        value_discretization_interval=1e-3,
        use_connect_dots=True
    )

    pld = PrivacyLossDistribution(pld_pmf)

    # self_compose(k) yields the k-fold composition (k == 1 is the PLD itself),
    # so the requested count is passed directly; passing num_compositions - 1
    # would drop one application of the mechanism.
    if num_compositions > 1:
        pld_composed = pld.self_compose(num_compositions)
    else:
        pld_composed = pld

    return pld_composed.get_delta_for_epsilon(epsilons)

In [None]:
# Sweep grid: population sizes, batch sizes as a fraction of the population,
# numbers of marked items, and the two accounting modes to compare.
num_totals = [10 ** 4, 10 ** 6]
relative_batch_sizes = [0.1, 0.01, 0.001]
num_bads = [1, 2, 4, 8, 16, 32]
modes = ['bilevel', 'wor']

# Number of mechanism applications requested for every configuration.
num_compositions = 100

# One record per (num_total, batch_size, num_bad, mode) combination.
sweep_records = []

sweep_grid = product(num_totals, relative_batch_sizes, num_bads, modes)
for num_total, relative_batch_size, num_bad, mode in sweep_grid:
    batch_size = int(num_total * relative_batch_size)
    print(num_total, batch_size, num_bad, mode)

    # Any mode other than 'bilevel' is accounted with the 'wor' function.
    compute_deltas = get_deltas_bilevel if mode == 'bilevel' else get_deltas_wor
    deltas = compute_deltas(epsilons, num_bad, num_total, batch_size, num_compositions)

    record = {
        'mode': mode,
        'num_bad': num_bad,
        'num_total': num_total,
        'batch_size': batch_size,
        'epsilons': epsilons,
        'deltas': deltas
    }
    sweep_records.append(record)

df_results = pd.DataFrame(sweep_records)

In [None]:
# Display the collected results frame as this cell's output.
df_results

In [None]:
# Write the results frame to a parquet file in the current working directory.
df_results.to_parquet('./wor_top_wr_bottom.pq')

In [11]:
# Load the results frame from the parquet file in the current working directory.
df_results = pd.read_parquet('./wor_top_wr_bottom.pq')

In [16]:
def plot_tradeoff(df, num_total, relative_batch_size, num_bad_filter=None, xlim=None, ylim=None, plot_legend=False):
    """Plot delta(epsilon) curves for one (num_total, batch size) configuration.

    Rows of ``df`` matching ``num_total`` and
    ``int(num_total * relative_batch_size)`` are drawn on a new figure, one
    curve per row. Rows with mode ``'bilevel'`` are drawn solid, all others
    dashed. Curves sharing a ``num_bad`` value share a color.

    Args:
        df: Frame with columns ``mode``, ``num_bad``, ``num_total``,
            ``batch_size``, ``epsilons`` and ``deltas``; the last two hold one
            array per row.
        num_total: Value of ``num_total`` to select.
        relative_batch_size: Batch size as a fraction of ``num_total``.
        num_bad_filter: Optional collection of ``num_bad`` values to keep.
        xlim: Optional upper epsilon limit; points beyond it are dropped.
        ylim: Optional upper limit of the y axis.
        plot_legend: Whether to draw the color and line-style legends.
    """
    fig, ax = plt.subplots(1, 1)

    batch_size = int(num_total * relative_batch_size)

    selected = df.loc[(df['batch_size'] == batch_size)
                      & (df['num_total'] == num_total)]

    if num_bad_filter is not None:
        # Boolean-mask indexing on the 'num_bad' column, which is the column
        # the rest of this function reads.
        selected = selected.loc[selected['num_bad'].isin(num_bad_filter)]

    selected = selected[['mode', 'num_bad', 'epsilons', 'deltas']].sort_values('num_bad')

    # Assign colors by rank among the num_bad values actually plotted, so that
    # filtered subsets and values that are not powers of two stay in range.
    num_bad_values = sorted(selected['num_bad'].unique())
    pal = sns.color_palette('colorblind', len(num_bad_values))
    color_by_num_bad = dict(zip(num_bad_values, pal))

    # Epsilon range over all plotted curves, used for the x-axis limits.
    eps_lower = None
    eps_upper = None

    rows = zip(selected['mode'], selected['num_bad'],
               selected['epsilons'], selected['deltas'])
    for mode, num_bad, curve_eps, curve_deltas in rows:
        curve_eps = np.asarray(curve_eps)
        curve_deltas = np.asarray(curve_deltas)

        if xlim is not None:
            keep = curve_eps <= xlim
            curve_deltas = curve_deltas[keep]
            curve_eps = curve_eps[keep]

        color = color_by_num_bad[num_bad]

        linestyle = 'solid' if (mode == 'bilevel') else 'dashed'
        ax.plot(curve_eps, curve_deltas, linestyle=linestyle, zorder=3, color=color)

        if mode == 'bilevel':
            # Dummy lines for legend
            ax.plot([], [], label=f'{num_bad}', linestyle='solid', zorder=3, color=color)

        # Curves left empty by the xlim cut do not contribute to the range.
        if curve_eps.size > 0:
            curve_min = curve_eps.min()
            curve_max = curve_eps.max()
            eps_lower = curve_min if eps_lower is None else min(eps_lower, curve_min)
            eps_upper = curve_max if eps_upper is None else max(eps_upper, curve_max)

    if eps_lower is not None:
        ax.set_xlim(left=eps_lower, right=xlim if xlim is not None else eps_upper)
    elif xlim is not None:
        # Nothing was plotted; still honor the requested upper limit.
        ax.set_xlim(right=xlim)

    ax.set_ylim(bottom=0)
    if ylim is not None:
        ax.set_ylim(top=ylim)

    ax.minorticks_off()

    # Raw strings keep the LaTeX backslashes without invalid-escape warnings.
    ax.set_xlabel(r'$\epsilon$')
    ax.set_ylabel(r'$\delta(\epsilon)$')

    if plot_legend:
        # First legend: one colored entry per num_bad value.
        legend_1 = ax.legend(title='$L_C + L_F$', title_fontsize=10, loc='upper right')

        # Second legend: line style per method, drawn with black dummy lines.
        linestyles = ['solid', 'dashed']
        labels = ['Structured Subsampling', 'Standard DP-SGD']
        dummy_lines = [
            ax.plot([], [], c="black", linestyle=linestyle)[0]
            for linestyle in linestyles
        ]
        ax.legend(dummy_lines, labels, loc='upper left')

        # Creating the second legend replaces the first; add the first back.
        ax.add_artist(legend_1)

In [17]:
# Per-figure settings; entries at the same position belong to the same figure.
num_totals = [10 ** 4, 10 ** 4, 10 ** 6, 10 ** 6]
relative_batch_sizes = [0.1, 0.001, 0.1, 0.001]
xlims = [400, 5, 400, 5]
ylims = [1.0, 0.5, 1.0, 0.5]

# NOTE(review): not used by any cell visible here -- confirm whether figures are saved elsewhere.
save_dir = '/ceph/hdd/staff/schuchaj/dp_timeseries_plots_camera_icml25/eval_pld_bilevel_vs_blackbox'

panel_settings = zip(num_totals, relative_batch_sizes, xlims, ylims)
for num_total, relative_batch_size, xlim, ylim in panel_settings:
    plot_tradeoff(
        df_results,
        num_total,
        relative_batch_size,
        num_bad_filter=None,
        xlim=xlim,
        ylim=ylim,
        plot_legend=True,
    )