In [2]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.stats import norm

# Define the root directory containing the subfolders
root_dir = os.path.expanduser("~/fdp-estimation/results/results_SGD")

# Find all subfolders in the root directory
subfolders = [f.path for f in os.scandir(root_dir) if f.is_dir()]

# Placeholder for storing overall results
overall_results = []

In [1]:
import numpy as np
from scipy.stats import norm
from itertools import combinations

def calc_mu(x, m, eta_learn, T_):
    """
    Calculate mu based on the given parameters.
    """
    mu = np.sum(eta_learn * (1 - eta_learn)**(T_ - np.array(x)) / m)
    return mu

# Initialize mu_vector
mu_vector = []

# Define parameters
n=10
sigma = 0.2
theta_0 = 0
m = 5
eta_learn = 0.2
T_ = 10
x1 = np.zeros(n)
x2 = np.array([1] + [0] * (n-1))

h = 0.1
eta_max = 15
for k in range(1, T_ + 1):
    k_combinations = list(combinations(range(1, T_ + 1), k))
    mu_values = [calc_mu(x, m=m, eta_learn=eta_learn, T_=T_) for x in k_combinations]
    mu_vector.extend(mu_values)

# Calculate sigma_tilde
sigma_tilde = eta_learn * sigma * np.sqrt((1 - (1 - eta_learn)**(2 * T_)) / (1 - (1 - eta_learn)**2))

def SGD_Curve(alpha):
    """
    Compute the SGD curve for a given alpha.
    """
    sgd_sum = np.sum(norm.cdf(norm.ppf(1 - alpha) - np.array(mu_vector) / sigma_tilde) / 2**T_)
    return sgd_sum



In [2]:
SGD_Curve(alpha=0.2)

0.3252972912981455

In [19]:
# Iterate through each subfolder
for subfolder in subfolders:
    print(f"Processing subfolder: {subfolder}")

    # Find all CSV files in the current subfolder
    csv_files = glob.glob(os.path.join(subfolder, "*.csv"))

    # Placeholder for storing subfolder-specific results
    subfolder_results = []

    for file in csv_files:
        # Read the CSV file
        data = pd.read_csv(file)

        # Ensure the CSV contains 'alpha' and 'beta' columns
        if 'alpha' not in data.columns or 'beta' not in data.columns:
            print(f"Skipping {file}: Missing required columns 'alpha' and 'beta'.")
            continue

        # Filter the data to only include rows where 0 <= alpha <= 1
        filtered_data = data[(data['alpha'] >= 0) & (data['alpha'] <= 1)]

        # Skip if no valid rows remain
        if filtered_data.empty:
            print(f"Skipping {file}: No valid alpha values between 0 and 1.")
            continue

        # Compute the maximum error for the current file
        alpha_values = filtered_data['alpha'].values
        beta_values = filtered_data['beta'].values
        f_values = np.array([SGD_Curve(alpha) for alpha in alpha_values])
        max_error = np.max(np.abs(f_values - beta_values))

        # Store the result
        result = {
            "file": file,
            "max_error": max_error
        }
        subfolder_results.append(result)

    # Convert subfolder results to a DataFrame
    subfolder_df = pd.DataFrame(subfolder_results)


    # Save the subfolder-specific results to a new CSV
    output_file = os.path.join(subfolder, "max_errors.csv")
    subfolder_df.to_csv(output_file, index=False)
    print(f"Saved maximum errors for subfolder: {subfolder}")

    # Append to overall results
    overall_results.extend(subfolder_results)

# Convert overall results to a DataFrame
overall_df = pd.DataFrame(overall_results)

# Save the overall results
overall_output_file = os.path.join(root_dir, "overall_max_errors.csv")
overall_df.to_csv(overall_output_file, index=False)
print(f"Saved overall maximum errors to: {overall_output_file}")

# Extract alpha and beta for plotting
#alpha = output_df["alpha"]
#beta = output_df["beta"]

Processing subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N10000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N10000
Processing subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N1000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N1000
Processing subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N1000000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N1000000
Processing subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N100
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N100
Processing subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N100000
Saved maximum errors for subfolder: /home/martin/fdp-estimation/results/results_SGD/results_SGD_N100000
Saved overall maximum errors to: /home/m

In [4]:
def compute_mse_from_csv(file_path):
    data = pd.read_csv(file_path)
    errors = data["max_error"].values
    
    # Compute the Mean Squared Error
    mse = np.mean(errors)**2+np.var(errors)
    return mse
mse_100=compute_mse_from_csv("~/fdp-estimation/results/results_SGD/results_SGD_N100/max_errors.csv")
mse_1000=compute_mse_from_csv("~/fdp-estimation/results/results_SGD/results_SGD_N1000/max_errors.csv")
mse_10000=compute_mse_from_csv("~/fdp-estimation/results/results_SGD/results_SGD_N10000/max_errors.csv")
mse_100000=compute_mse_from_csv("~/fdp-estimation/results/results_SGD/results_SGD_N100000/max_errors.csv")
mse_1000000=compute_mse_from_csv("~/fdp-estimation/results/results_SGD/results_SGD_N1000000/max_errors.csv")
mse_values = {
    "N": [100, 1000, 10000, 100000, 1000000],
    "Mean Squared Error": [
        # Replace these with the actual MSE values if known
        mse_100, 
        mse_1000, 
        mse_10000, 
        mse_100000, 
        mse_1000000
    ]
}
mse_df = pd.DataFrame(mse_values)
mse_df.to_csv("mse_values.csv", index=False)