In [15]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.stats import norm

# Define the root directory containing the subfolders
root_dir = os.path.expanduser("~/fdp-estimation/results")

# Find all subfolders in the root directory
subfolders = [f.path for f in os.scandir(root_dir) if f.is_dir()]

# Placeholder for storing overall results
overall_results = []

In [16]:
# Define Gaussian curve function
mu_1 = 0
mu_2 = 1


def Gaussian_curve(alpha):
    return norm.cdf(norm.ppf(1 - alpha) - mu_2)

In [None]:
# Iterate through each subfolder
for subfolder in subfolders:
    print(f"Processing subfolder: {subfolder}")

    # Find all CSV files in the current subfolder
    csv_files = glob.glob(os.path.join(subfolder, "*.csv"))

    # Placeholder for storing subfolder-specific results
    subfolder_results = []

    for file in csv_files:
        # Read the CSV file
        data = pd.read_csv(file)

        # Ensure the CSV contains 'alpha' and 'beta' columns
        if 'alpha' not in data.columns or 'beta' not in data.columns:
            print(f"Skipping {file}: Missing required columns 'alpha' and 'beta'.")
            continue

        # Filter the data to only include rows where 0 <= alpha <= 1
        filtered_data = data[(data['alpha'] >= 0) & (data['alpha'] <= 1)]

        # Skip if no valid rows remain
        if filtered_data.empty:
            print(f"Skipping {file}: No valid alpha values between 0 and 1.")
            continue

        # Compute the maximum error for the current file
        alpha_values = filtered_data['alpha'].values
        beta_values = filtered_data['beta'].values
        f_values = Gaussian_curve(alpha_values)
        max_error = np.max(np.abs(f_values - beta_values))

        # Store the result
        result = {
            "file": file,
            "max_error": max_error
        }
        subfolder_results.append(result)

    # Convert subfolder results to a DataFrame
    subfolder_df = pd.DataFrame(subfolder_results)


    # Save the subfolder-specific results to a new CSV
    output_file = os.path.join(subfolder, "max_errors.csv")
    subfolder_df.to_csv(output_file, index=False)
    print(f"Saved maximum errors for subfolder: {subfolder}")

    # Append to overall results
    overall_results.extend(subfolder_results)

# Convert overall results to a DataFrame
overall_df = pd.DataFrame(overall_results)

# Save the overall results
overall_output_file = os.path.join(root_dir, "overall_max_errors.csv")
overall_df.to_csv(overall_output_file, index=False)
print(f"Saved overall maximum errors to: {overall_output_file}")

# Extract alpha and beta for plotting
#alpha = output_df["alpha"]
#beta = output_df["beta"]

Processing subfolder: /home/martin/fdp-estimation/results/results_N10000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_N10000
Processing subfolder: /home/martin/fdp-estimation/results/results_N1000000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_N1000000
Processing subfolder: /home/martin/fdp-estimation/results/results_laplace_N100000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_laplace_N100000
Processing subfolder: /home/martin/fdp-estimation/results/results_subsampling_N1000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_subsampling_N1000
Processing subfolder: /home/martin/fdp-estimation/results/results_subsampling_N1000000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_subsampling_N1000000
Processing subfolder: /home/martin/fdp-estimation/results/results_N1000
Saved maximum errors for subfolder: /home/martin/fdp-est

NameError: name 'output_df' is not defined

In [None]:
# Plot the results
plt.figure(figsize=(8, 6))
plt.scatter(alpha, beta, color="firebrick", s=10, label="KDE Estimator")
alpha_values = np.linspace(0, 1, 1000)
plt.plot(alpha_values, Gaussian_curve(alpha_values), color="deepskyblue", linewidth=1.5, label="Gaussian Curve")
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.xlabel("Alpha")
plt.ylabel("Beta")
plt.title("KDE Estimator vs Gaussian Curve")
plt.legend()
plt.grid()

# Save the plot
#output_filename = f"L_output_plot_N{N}_h_{h:.1f}.png"
#plt.savefig(output_filename, dpi=300)
plt.show()