In [1]:
import numpy as np
import os
import sys
import time
import logging
import matplotlib.pyplot as plt

# Resolve the project root (the parent of the current working directory)
# and the 'src' directory inside it.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_dir = os.path.join(project_dir, 'src')

# Make the modules under src importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
if src_dir not in sys.path:
    sys.path.append(src_dir)

import mech.GaussianDist as GaussianModule
import mech.LapDist as LaplaceModule
import mech.toy_DPSGD as DP_SGDModule
import mech.Subsampling as SubsamplingModule

In [2]:
from scipy.stats import norm
# Means of the two Gaussians that are compared
mu_1 = 0
mu_2 = 1


def Gaussian_curve(alpha):
    """Evaluate the reference Gaussian curve at ``alpha``.

    Computes ``Phi(Phi^{-1}(1 - alpha) - |mu_2 - mu_1|)`` where ``Phi`` is the
    standard normal CDF. The shift is the distance between the two means, so
    changing ``mu_1`` is honoured as well; with ``mu_1 = 0`` and ``mu_2 = 1``
    the shift is 1.

    Args:
        alpha: Scalar or array-like of values in [0, 1].

    Returns:
        The curve value(s), with the same shape as ``alpha``. The endpoints
        evaluate to 1 at ``alpha = 0`` and 0 at ``alpha = 1``.
    """
    return norm.cdf(norm.ppf(1 - alpha) - abs(mu_2 - mu_1))

In [4]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d


# Folder holding the per-run Gaussian estimator outputs ("results_N10000")
results_path = "~/Documents/R/f-DP/results/results_Gaussian/results_N10000"
subfolder = os.path.expanduser(results_path)

# expanduser() always returns a non-empty string, so test that the directory
# actually exists instead of testing the truthiness of the path.
if not os.path.isdir(subfolder):
    print(f"Subfolder '{os.path.basename(subfolder)}' not found.")
else:
    print(f"Processing subfolder: {subfolder}")

    # glob returns matches in arbitrary order; sorting makes row i of the
    # deviation matrix refer to the same file on every run.
    csv_files = sorted(glob.glob(os.path.join(subfolder, "*.csv")))

    # One interpolated beta curve per successfully processed file
    all_results = []

    # Common alpha grid: 500 equidistant points between 0 and 1
    target_points = np.linspace(0, 1, 500)

    # Reference (true) Gaussian curve evaluated on the same grid
    gausspoints = Gaussian_curve(target_points)

    for file in csv_files:
        # Read the CSV file
        data = pd.read_csv(file)

        # Files without 'alpha' and 'beta' columns are not estimator outputs
        if 'alpha' not in data.columns or 'beta' not in data.columns:
            print(f"Skipping {file}: Missing required columns 'alpha' and 'beta'.")
            continue

        # Keep rows with 0 <= alpha <= 1 and one row per distinct alpha
        filtered_data = data[(data['alpha'] >= 0) & (data['alpha'] <= 1)]
        filtered_data = filtered_data.drop_duplicates(subset=['alpha'], keep='first')
        # Skip if no valid rows remain
        if filtered_data.empty:
            print(f"Skipping {file}: No valid alpha values between 0 and 1.")
            continue

        # Extract alpha and beta values
        alpha_values = filtered_data['alpha'].values
        beta_values = filtered_data['beta'].values

        # Piecewise-linear interpolant; grid points outside the observed alpha
        # range are linearly extrapolated.
        try:
            interpolation_function = interp1d(alpha_values, beta_values, kind='linear', fill_value="extrapolate")
        except ValueError as e:
            print(f"Skipping {file}: Interpolation error - {e}")
            continue

        # Evaluate the interpolant on the common grid
        interpolated_beta_values = interpolation_function(target_points)

        # Store results for this file
        all_results.append(interpolated_beta_values)

    if all_results:
        # Stack the curves into an array of shape (num_files, 500)
        all_results_array = np.array(all_results)

        # Deviation of every estimated curve from the reference curve,
        # one row per file and one column per grid point
        deviation_matrix = all_results_array - gausspoints

        # Output the shape of the deviation matrix
        print(f"Deviation matrix shape: {deviation_matrix.shape}")
    else:
        print("No valid data processed in the subfolder.")

# Enforce boundary conditions
#min_curve[0] = 1
#max_curve[0] = 1
#min_curve[-1] = 0
#max_curve[-1] = 0

Processing subfolder: /home/martin/Documents/R/f-DP/results/results_Gaussian/results_N10000
Skipping /home/martin/Documents/R/f-DP/results/results_Gaussian/results_N10000/max_errors.csv: Missing required columns 'alpha' and 'beta'.
Deviation matrix shape: (1000, 500)


In [5]:
# Leave out the first and last columns of the deviation matrix
trimmed_matrix = deviation_matrix[:, 1:-1]
# For every row, the column position (within the trimmed matrix) of its smallest entry
min_indices = trimmed_matrix.argmin(axis=1)

In [6]:
# eta grid: 1000 equally spaced values from 0 to eta_max
eta_max = 15
eta_vector = np.linspace(0, eta_max, 1000)

# Leave out the first and last columns of the deviation matrix
trimmed_matrix = deviation_matrix[:, 1:-1]

# NOTE: despite its name, min_indices holds the smallest deviation of each
# row (the values themselves); the matching column positions within the
# trimmed matrix are stored in min_locations.
min_indices = trimmed_matrix.min(axis=1)
min_locations = trimmed_matrix.argmin(axis=1)

# +1 undoes the column trimming; the factor 2 then maps that column index onto
# eta_vector (presumably 1000 eta points per 500 grid columns - TODO confirm).
eta_values = eta_vector[2 * (min_locations + 1)]
print((eta_values > 0).sum())

1000


## 1 Example code of Gaussian Mechanism

In [7]:
# Sample sizes for the quick example runs
test_train_sample_size = 100000
test_test_sample_size = 100000

# build() is called with a length-1 array holding the first selected eta
eta = np.array([eta_values[0]])

# Gaussian mechanism: generate parameters, construct the estimator, estimate beta
kwargs = GaussianModule.generate_params(
    num_train_samples=test_train_sample_size,
    num_test_samples=test_test_sample_size,
)
estimator = GaussianModule.GaussianDistEstimator(kwargs)
output = estimator.build(eta=eta)
beta_estimate = output["beta"]

In [8]:
# Show the beta estimate taken from the most recent estimator.build(...) output
print(beta_estimate)

[0.41576]


### 1.1 Parameters of the tested Gaussian (with N(0, 1) and N(1, 1))

In [3]:
# Display the parameter dictionary currently bound to kwargs
kwargs

{'dist': {'mean0': array([0]),
  'cov0': array([[1]]),
  'mean1': array([1]),
  'cov1': array([[1]])},
 'num_train_samples': 100000,
 'num_test_samples': 100000}

## 2 Example code for other mechanisms, which share a similar API

In [3]:
# Laplace mechanism: same generate_params -> estimator -> build sequence,
# reusing the sample sizes and eta defined for the Gaussian example
kwargs = LaplaceModule.generate_params(
    num_train_samples=test_train_sample_size,
    num_test_samples=test_test_sample_size,
)
estimator = LaplaceModule.LapDistEstimator(kwargs)
output = estimator.build(eta=eta)
beta_estimate = output["beta"]

In [6]:
# Toy DP-SGD mechanism: same generate_params -> estimator -> build sequence.
# Uses the shared sample-size variables (both 100000) instead of repeating
# the literal, so all example cells stay in sync.
kwargs = DP_SGDModule.generate_params(
    num_train_samples=test_train_sample_size,
    num_test_samples=test_test_sample_size,
)
estimator = DP_SGDModule.toy_DPSGDEstimator(kwargs)
output = estimator.build(eta=eta)
beta_estimate = output["beta"]

In [9]:
# Subsampling mechanism: same generate_params -> estimator -> build sequence.
# Uses the shared sample-size variables (both 100000) instead of repeating
# the literal, so all example cells stay in sync.
kwargs = SubsamplingModule.generate_params(
    num_train_samples=test_train_sample_size,
    num_test_samples=test_test_sample_size,
)
estimator = SubsamplingModule.SubsamplingEstimator(kwargs)
output = estimator.build(eta=eta)
beta_estimate = output["beta"]

## 3 Set parameters for theoretical accuracy bound

The bound computed below says that we need num_train_samples = 10^9 and num_test_samples = 10^7 to bring the error down to the order of 10^-3 (about 4.3 × 10^-3) with failure probability gamma = 0.05.

In practice, far fewer samples are needed, so the theoretical bound can likely be tightened.

In [9]:
def compute_expression(n, gamma, c_d=3.8637):
    """Return ``12 * sqrt((2 * c_d**2 / n) * log(4 / gamma))``.

    Args:
        n: Sample size appearing in the denominator; must be positive.
        gamma: Value used in ``log(4 / gamma)``; must be positive, and at
            most 4 for the square root to be real.
        c_d: Constant of the bound. Defaults to 3.8637, the value that was
            previously hard-coded.

    Returns:
        The value of the expression. It shrinks as ``n`` grows and scales
        linearly with ``c_d``.
    """
    result = 12 * np.sqrt((2 * c_d ** 2 / n) * np.log(4 / gamma))
    return result

# Example usage:
n = 10**9  # Example value for n
gamma = 0.05  # Example value for gamma
print(compute_expression(n, gamma))

0.004340473891924778


In [None]:

# Fixed parameters
# NOTE: 10**9 training samples for every entry of eta_values makes this cell
# very expensive to run.
train_sample_size = 10**9
test_sample_size = 10**7

# Generate parameters
kwargs = GaussianModule.generate_params(
    num_train_samples=train_sample_size,
    num_test_samples=test_sample_size
)

# Initialize the estimator
estimator = GaussianModule.GaussianDistEstimator(kwargs)

# Estimate beta for every selected eta. The loop variable is deliberately not
# called `eta`, so the array `eta` used by the example cells is not overwritten
# with a scalar.
beta_estimates = []
for eta_value in eta_values:
    # Wrap eta in an array since `estimator.build` expects an array
    eta_array = np.array([eta_value])
    output = estimator.build(eta=eta_array)
    beta_estimate = output["beta"]
    beta_estimates.append(beta_estimate)

# Convert beta_estimates to a NumPy array for further analysis
beta_estimates = np.array(beta_estimates)

# Compare against the reference curve VALUES; comparing with the function
# object Gaussian_curve itself raises a TypeError.
# Assumes eta_values[i] belongs to the grid point target_points[min_locations[i] + 1],
# i.e. the same column index that was used to select it - TODO confirm.
alpha_at_min = target_points[min_locations + 1]
# The bound is evaluated for the training sample size used in this cell.
# ravel() keeps the comparison elementwise when each beta is a length-1 array.
result = np.ravel(beta_estimates) + compute_expression(train_sample_size, gamma) < Gaussian_curve(alpha_at_min)
print("Beta estimates for all eta values:", beta_estimates)


In [2]:
import time

# Benchmark: wall-clock time of one complete Subsampling estimate
# (parameter generation + estimator construction + build) with 10^6
# training and 10^6 test samples.

# Initialize variables
num_runs = 5
running_times = []

# Run the estimate num_runs times and record each duration
for _ in range(num_runs):
    # perf_counter() is monotonic and high resolution, so the measured
    # interval is not affected by system clock adjustments.
    start_time = time.perf_counter()
    kwargs = SubsamplingModule.generate_params(num_train_samples = 1000000, num_test_samples = 1000000)
    estimator = SubsamplingModule.SubsamplingEstimator(kwargs)
    output = estimator.build(eta = 1)
    end_time = time.perf_counter()
    running_times.append(end_time - start_time)

# Compute average running time
average_time = sum(running_times) / num_runs

# Display the results
print(f"Average Running Time: {average_time:.6f} seconds")
print(f"Individual Running Times: {running_times}")

Average Running Time: 66.983505 seconds
Individual Running Times: [52.63915729522705, 51.10759377479553, 72.97279334068298, 80.17493963241577, 78.02304124832153]
