In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


In [None]:
# Step 1: Read the combined S-parameter dataset into a DataFrame
file_path = "combined_spar_data_full_parameters_split.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Step 2: Define features and labels
# Model inputs (ib and ic are deliberately not part of the inputs)
feature_columns = [
    "freq",  # Frequency
    "vb",  # Voltage parameters
    "vc",
    "DEV_GEOM_L",  # Device geometry
    "NUM_OF_TRANS_RF",
    # Disabled device parameters: "gm", "Cpi", "Cmu"
    # Disabled impedance components: "Zin_real", "Zin_imag", "Zout_real"
]

# Labels - de-embedded S-parameters, generated in the order
# S11, S12, S21, S22 with the real part before the imaginary part
s_parameter_labels = [
    f"S_deemb({row},{col})_{part}"
    for row in (1, 2)
    for col in (1, 2)
    for part in ("real", "imag")
]


In [None]:
# Step 3: Count missing values per column, separately for features and labels
print("Checking for null values in features:")
feature_nulls = df[feature_columns].isna().sum()
print(feature_nulls.loc[feature_nulls > 0])  # Only show features with nulls

print("\nChecking for null values in labels:")
label_nulls = df[s_parameter_labels].isna().sum()
print(label_nulls)

In [None]:
# Step 4: Keep only the rows that are complete in every feature and label column
required_columns = feature_columns + s_parameter_labels
df_clean = df.dropna(subset=required_columns)

n_rows_removed = df.shape[0] - df_clean.shape[0]
print(f"\nOriginal dataset shape: {df.shape}")
print(f"Cleaned dataset shape: {df_clean.shape}")
print(f"Removed {n_rows_removed} rows with null values")

In [None]:
# Step 5: Copy the feature and label columns into two independent frames
X = df_clean.loc[:, feature_columns].copy()
Y = df_clean.loc[:, s_parameter_labels].copy()

# Shapes confirm that both frames have the same number of rows
print(f"\nFeature dataset shape: {X.shape}")
print(f"S-parameter labels shape: {Y.shape}")

# Step 6: Summary statistics for the leading columns of each frame
feature_summary = X.iloc[:, :5].describe()
print("\nFeature statistics (first 5 columns):")
print(feature_summary)


label_summary = Y.iloc[:, :4].describe()
print("\nS-parameter statistics (first 4 columns):")
print(label_summary)

# Optional: Save cleaned datasets to files
# X.to_csv("hbt_features.csv", index=False)
# Y.to_csv("hbt_sparam_labels.csv", index=False)

print("\nFeature and label separation complete!")

In [None]:
def plot_feature_vs_label_correlations(X, y, target_names, filename):
    """Save a heatmap of feature-vs-label correlations and return the table.

    Parameters:
    -----------
    X : pandas DataFrame
        Feature columns
    y : pandas DataFrame
        Label columns; must contain every name in ``target_names``
    target_names : list of str
        Label columns to show as heatmap columns
    filename : str
        Path the figure is written to

    Returns:
    --------
    feature_target_corr : pandas DataFrame
        Correlations with one row per feature and one column per target
    """
    # Correlate all columns at once, then keep only the feature rows and the
    # requested label columns
    full_corr = pd.concat([X, y], axis=1).corr()
    feature_target_corr = full_corr.loc[X.columns, target_names]

    # Render the table and write it to disk; the figure is closed afterwards
    # so nothing is displayed inline
    plt.figure(figsize=(12, 10))
    sns.heatmap(feature_target_corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Feature-Target Correlations")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

    return feature_target_corr

In [None]:
# Correlation heatmap for one S-parameter pair (S11 used as the example)
s11_labels = ["S_deemb(1,1)_real", "S_deemb(1,1)_imag"]
s11_targets = Y[s11_labels]
s11_corr = plot_feature_vs_label_correlations(
    X, s11_targets, s11_labels, "s11_correlations.png"
)

# Rank features by absolute correlation, separately for each S11 component
print("\nTop 5 features correlated with S11 parameters:")
for label in s11_labels:
    ranked = s11_corr[label].abs().sort_values(ascending=False)
    top_features = ranked.head(5)
    print(f"\nTop features for {label}:")
    print(top_features)

In [None]:
def create_frequency_based_split(df, test_size=0.2, random_state=42):
    """
    Create a train-test split over whole frequency points.

    Every k-th unique frequency (k = int(1 / test_size)) is held out first.
    The held-out set is then adjusted per frequency band towards
    ``test_size`` of the band's frequencies: extra frequencies are only added
    when they are not neighbours of an already held-out one, and surplus ones
    are removed. A final check prints a warning if two neighbouring
    frequencies still ended up in the test set (this always happens when
    test_size > 0.5, because then k == 1).

    All rows sharing a held-out frequency go to the test set, so a frequency
    never appears on both sides of the split.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing a 'freq' column
    test_size : float, default=0.2
        Proportion of unique frequency values to include in test set.
        Must satisfy 0 < test_size <= 1.
    random_state : int, default=42
        Seed passed to np.random.seed. The selection itself uses no random
        numbers; the call is kept so the global NumPy RNG state after this
        function is the same as before this rewrite.

    Returns:
    --------
    train_mask : pandas Series of bool
        Row mask for training data, indexed like ``df``
    test_mask : pandas Series of bool
        Row mask for test data, indexed like ``df``

    Raises:
    -------
    ValueError
        If test_size is not in the interval (0, 1]
    """
    # int(1 / test_size) must be a usable stride (>= 1)
    if not 0 < test_size <= 1:
        raise ValueError(f"test_size must be in the interval (0, 1], got {test_size!r}")

    # Set random seed (kept for backward compatibility, see docstring)
    np.random.seed(random_state)

    # Get sorted unique frequency values
    unique_freqs = np.sort(df["freq"].unique())
    n_freqs = len(unique_freqs)
    print(f"Found {n_freqs} unique frequency values")

    # Define band boundaries as half-open intervals [lower, upper)
    band_boundaries = [
        (0, 1e9),  # Band 1: < 1 GHz
        (1e9, 6e9),  # Band 2: 1-6 GHz
        (6e9, 20e9),  # Band 3: 6-20 GHz
        (20e9, 40e9),  # Band 4: 20-40 GHz
        (40e9, float("inf")),  # Band 5: > 40 GHz
    ]
    n_bands = len(band_boundaries)

    # Assign frequencies to bands; a value matching no interval stays in band 0
    freq_bands = np.zeros(n_freqs, dtype=int)
    for i, freq in enumerate(unique_freqs):
        for band_idx, (lower, upper) in enumerate(band_boundaries):
            if lower <= freq < upper or (band_idx == n_bands - 1 and freq >= lower):
                freq_bands[i] = band_idx
                break

    # Count frequencies in each band
    band_counts = np.bincount(freq_bands, minlength=n_bands)
    for band_idx, count in enumerate(band_counts):
        print(f"Band {band_idx + 1}: {count} frequency values")

    # Base selection: every k-th frequency
    k = int(1 / test_size)  # If test_size is 0.2, k=5 means select every 5th frequency
    held_out = list(range(0, n_freqs, k))
    print(f"Base selection gives {len(held_out)} test frequencies (every {k}th)")

    # Target number of test frequencies per band: at least one for every
    # non-empty band, none for a band that holds no frequencies
    target_per_band = np.array(
        [
            max(1, int(round(count * test_size))) if count > 0 else 0
            for count in band_counts
        ],
        dtype=int,
    )

    print("Target test frequencies per band:")
    for i, target in enumerate(target_per_band):
        print(f"Band {i + 1}: {target}")

    # How many frequencies the base selection put into each band
    actual_per_band = np.bincount(
        freq_bands[np.asarray(held_out, dtype=int)], minlength=n_bands
    )

    print("Actual initial test frequencies per band:")
    for i, actual in enumerate(actual_per_band):
        print(f"Band {i + 1}: {actual}")

    # Top up bands that are below target
    held_out_set = set(held_out)
    for band in range(n_bands):
        n_needed = int(target_per_band[band] - actual_per_band[band])
        if n_needed <= 0:
            continue

        # Candidates: in this band, not selected, and not next to a selected index
        band_candidates = [
            i
            for i in np.flatnonzero(freq_bands == band).tolist()
            if not held_out_set & {i - 1, i, i + 1}
        ]

        # Only top up when the whole shortfall can be covered
        if len(band_candidates) >= n_needed:
            # Choose candidates with roughly equal spacing
            step = max(1, len(band_candidates) // n_needed)
            selected = band_candidates[::step][:n_needed]
            held_out.extend(selected)
            held_out_set.update(selected)
            actual_per_band[band] += len(selected)

    # Trim bands that are above target
    for band in range(n_bands):
        n_remove = int(actual_per_band[band] - target_per_band[band])
        if n_remove <= 0:
            continue

        band_selected = [i for i in held_out if freq_bands[i] == band]
        if band_selected:
            # Choose which ones to remove (spaced out)
            step = max(1, len(band_selected) // n_remove)
            to_remove = set(band_selected[::step][:n_remove])
            held_out = [i for i in held_out if i not in to_remove]
            actual_per_band[band] -= len(to_remove)

    print("Final test frequencies per band after adjustment:")
    for i, actual in enumerate(actual_per_band):
        print(f"Band {i + 1}: {actual} (target: {target_per_band[i]})")

    # Sort the indices; the explicit integer dtype keeps an empty selection
    # usable as an index array
    test_indices = np.sort(np.asarray(held_out, dtype=int))

    # Report (but do not repair) neighbouring frequencies in the test set
    for i in range(len(test_indices) - 1):
        if test_indices[i + 1] - test_indices[i] == 1:
            print(
                f"WARNING: Consecutive frequencies in test set: {unique_freqs[test_indices[i]]} and {unique_freqs[test_indices[i + 1]]}"
            )

    # Create test frequencies set
    test_freqs = unique_freqs[test_indices]

    # Create train and test masks (row-level, aligned with df's index)
    test_mask = df["freq"].isin(test_freqs)
    train_mask = ~test_mask

    print(f"Final training set: {train_mask.sum()} samples")
    print(f"Final test set: {test_mask.sum()} samples")

    return train_mask, test_mask

In [None]:
def create_improved_frequency_split(df, test_size=0.2, random_state=42):
    """
    Create a frequency-based train-test split with randomly jittered,
    band-wise frequency selection.

    1. Unique frequencies are grouped into fixed bands
    2. Within each non-empty band, about ``test_size`` of the frequencies are
       picked at a regular stride with a small random offset
    3. All rows sharing a picked frequency go to the test set, so there is no
       frequency overlap between train and test
    4. The train/test distributions of the device parameters are compared and
       a warning is printed when they differ strongly; the split itself is
       NOT changed by this check

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing dataset with 'freq' and device parameters
    test_size : float, default=0.2
        Proportion of unique frequency values to include in test set
    random_state : int, default=42
        Random seed for reproducibility (seeds the global NumPy RNG)

    Returns:
    --------
    train_mask, test_mask : pandas Series of bool
        Row masks for train and test data, indexed like ``df``
    """
    # Set random seed
    np.random.seed(random_state)

    # Get sorted unique frequency values
    unique_freqs = np.sort(df["freq"].unique())

    # Define frequency bands as half-open intervals [lower, upper)
    band_boundaries = [
        (0, 1e9),  # < 1 GHz
        (1e9, 6e9),  # 1-6 GHz
        (6e9, 20e9),  # 6-20 GHz
        (20e9, 40e9),  # 20-40 GHz
        (40e9, float("inf")),  # > 40 GHz
    ]
    last_band = len(band_boundaries) - 1

    # Assign frequencies to bands; a frequency matching no interval is left
    # out and therefore always ends up in the training set
    band_freqs = {i: [] for i in range(len(band_boundaries))}
    for freq in unique_freqs:
        for band_idx, (lower, upper) in enumerate(band_boundaries):
            if lower <= freq < upper or (band_idx == last_band and freq >= lower):
                band_freqs[band_idx].append(freq)
                break

    test_freqs = []

    # Calculate target number of test frequencies per band
    target_per_band = {
        band: max(1, int(len(freqs) * test_size)) for band, freqs in band_freqs.items()
    }

    # Select frequencies from each non-empty band
    for band, freqs in band_freqs.items():
        if len(freqs) > 0:
            sorted_freqs = np.sort(freqs)

            # Regular stride through the band
            n_select = target_per_band[band]
            step = max(1, len(sorted_freqs) // (n_select + 1))

            # Jitter indices to avoid selecting frequencies at exact intervals;
            # clipping keeps them inside the band
            indices = np.arange(step, len(sorted_freqs), step)[:n_select]
            indices = np.clip(
                indices + np.random.randint(-step // 4, step // 4, size=len(indices)),
                0,
                len(sorted_freqs) - 1,
            )

            # Jitter and clipping can produce duplicates; keep each index once
            indices = np.unique(indices)
            test_freqs.extend(sorted_freqs[indices])

    # Create train and test masks
    test_mask = df["freq"].isin(test_freqs)
    train_mask = ~test_mask

    # Check for balanced distribution of device parameters
    dev_params = ["DEV_GEOM_L", "NUM_OF_TRANS_RF", "vb", "vc"]
    for param in dev_params:
        if param in df.columns:
            train_dist = df.loc[train_mask, param].value_counts(normalize=True)
            test_dist = df.loc[test_mask, param].value_counts(normalize=True)

            # Compare shares category by category. value_counts() orders by
            # frequency, so the two results must be aligned on their index; a
            # category missing on one side counts as a share of 0 there.
            largest_gap = train_dist.subtract(test_dist, fill_value=0).abs().max()

            if largest_gap > 0.2:
                # Only a report: no rebalancing is performed here
                print(
                    f"Warning: Unbalanced distribution detected for {param}. Adjusting split..."
                )
                # This could be expanded with a rebalancing algorithm

    # Verify no frequency overlap
    train_freqs = df.loc[train_mask, "freq"].unique()
    test_freqs = df.loc[test_mask, "freq"].unique()
    overlap = np.intersect1d(train_freqs, test_freqs)
    assert len(overlap) == 0, "Frequency overlap detected in split!"

    print(f"Train set: {train_mask.sum()} samples ({train_mask.mean():.2%})")
    print(f"Test set: {test_mask.sum()} samples ({test_mask.mean():.2%})")

    return train_mask, test_mask

In [None]:
# Split on whole frequency points instead of on individual rows
train_mask, test_mask = create_frequency_based_split(
    df_clean, test_size=0.2, random_state=42
)

# Apply the boolean masks to features and labels; the copies keep the raw
# splits independent of X and Y
X_raw_train, X_raw_test = X.loc[train_mask].copy(), X.loc[test_mask].copy()
Y_raw_train, Y_raw_test = Y.loc[train_mask].copy(), Y.loc[test_mask].copy()

In [None]:
# Voltage features: start both splits from the raw frames
X_train = X_raw_train.copy()
X_test = X_raw_test.copy()

# Add the same 0/1 bias-point indicators to both splits
for split_frame in (X_train, X_test):
    split_frame["vb_is_zero"] = split_frame["vb"].eq(0).astype(int)
    split_frame["vb_is_high"] = split_frame["vb"].between(0.7, 0.9).astype(int)
    split_frame["vc_is_zero"] = split_frame["vc"].eq(0).astype(int)
    split_frame["vc_is_1_2V"] = split_frame["vc"].between(1.1, 1.3).astype(int)
    split_frame["vc_is_1_5V"] = split_frame["vc"].between(1.4, 1.6).astype(int)

# Fit the scaler on the training split ONLY so no test information leaks in
voltage_columns = ["vb", "vc"]
voltage_scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train[voltage_columns])

# Rescale both splits with the training-fitted scaler
X_train[voltage_columns] = voltage_scaler.transform(X_train[voltage_columns])
X_test[voltage_columns] = voltage_scaler.transform(X_test[voltage_columns])

# Persist the fitted scaler for future use
import joblib

joblib.dump(voltage_scaler, "voltage_scaler.pkl")

In [None]:
def encode_device_length(frame):
    """Return a copy of ``frame`` with DEV_GEOM_L replaced by 0/1 indicators.

    Adds DEV_L_0_9um, DEV_L_2_5um and DEV_L_5_0um (1 where DEV_GEOM_L equals
    0.9, 2.5 or 5.0 respectively, else 0) and drops DEV_GEOM_L. A frame that
    no longer has a DEV_GEOM_L column is returned as an unchanged copy, so
    re-running the cell is harmless.
    """
    encoded = frame.copy()
    if "DEV_GEOM_L" not in encoded.columns:
        return encoded

    indicator_values = (
        ("DEV_L_0_9um", 0.9),
        ("DEV_L_2_5um", 2.5),
        ("DEV_L_5_0um", 5.0),
    )
    for column, length in indicator_values:
        encoded[column] = (encoded["DEV_GEOM_L"] == length).astype(int)

    # Drop the original column once it is fully represented by the indicators
    return encoded.drop("DEV_GEOM_L", axis=1)


# Continue from the current X_train / X_test instead of re-copying the raw
# splits: restarting from X_raw_train here would silently discard the features
# engineered by the preceding preprocessing cell.
X_train = encode_device_length(X_train)
X_test = encode_device_length(X_test)

In [None]:
def encode_transistor_count(frame):
    """Return a copy of ``frame`` with NUM_OF_TRANS_RF replaced by 0/1 indicators.

    Adds TRANS_1, TRANS_2 and TRANS_4 (1 where NUM_OF_TRANS_RF equals 1, 2 or
    4 respectively, else 0) and drops NUM_OF_TRANS_RF. A frame that no longer
    has a NUM_OF_TRANS_RF column is returned as an unchanged copy, so
    re-running the cell is harmless.
    """
    encoded = frame.copy()
    if "NUM_OF_TRANS_RF" not in encoded.columns:
        return encoded

    for count in (1, 2, 4):
        encoded[f"TRANS_{count}"] = (encoded["NUM_OF_TRANS_RF"] == count).astype(int)

    # Drop the original column once it is fully represented by the indicators
    return encoded.drop("NUM_OF_TRANS_RF", axis=1)


# Continue from the current X_train / X_test instead of re-copying the raw
# splits: restarting from X_raw_train here would silently discard the features
# engineered by the preceding preprocessing cells.
X_train = encode_transistor_count(X_train)
X_test = encode_transistor_count(X_test)

In [None]:
# Device-parameter features (gm, Cpi, Cmu).
#
# Two fixes compared with re-copying X_raw_train / X_raw_test here:
#   * the cell continues from the current X_train / X_test, so features built
#     by earlier preprocessing cells are kept;
#   * the cell is skipped when the three columns are not present (they are
#     commented out of feature_columns), instead of failing with a KeyError.
import joblib

epsilon = 1e-20  # Small value to prevent log(0)
device_param_cols = ["gm", "Cpi", "Cmu"]
device_log_cols = [f"{col}_abs_log" for col in device_param_cols]
missing_device_params = [
    col for col in device_param_cols if col not in X_train.columns
]

if missing_device_params:
    print(
        "Skipping device-parameter features; columns not present in X_train: "
        f"{missing_device_params}"
    )
else:
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Apply the same stateless transforms to both splits
    for split_frame in (X_train, X_test):
        # Binary flags for physically questionable negative values
        for col in device_param_cols:
            split_frame[f"{col}_is_neg"] = (split_frame[col] < 0).astype(int)
        # Highly concentrated distributions: log10 of the magnitude
        for col in device_param_cols:
            split_frame[f"{col}_abs_log"] = np.log10(
                np.abs(split_frame[col]) + epsilon
            )

    # Fit the scaler ONLY on training data, then apply it to both splits
    robust_scaler = RobustScaler()
    robust_scaler.fit(X_train[device_log_cols])
    X_train[device_log_cols] = robust_scaler.transform(X_train[device_log_cols])
    X_test[device_log_cols] = robust_scaler.transform(X_test[device_log_cols])

    # Drop the original columns now that they are represented by the new ones
    X_train = X_train.drop(device_param_cols, axis=1)
    X_test = X_test.drop(device_param_cols, axis=1)

    # Save the scaler for future use
    joblib.dump(robust_scaler, "device_params_robust_scaler.pkl")

In [None]:
# Define transformation function
def signed_log_transform(x, epsilon=1e-20):
    """Compress the magnitude of ``x`` logarithmically while keeping its sign.

    Computes sign(x) * log1p(|x| + epsilon); an input of exactly 0 maps to 0
    because its sign factor is 0.
    """
    magnitude = np.log1p(np.abs(x) + epsilon)
    return np.sign(x) * magnitude


# Impedance features (Zin_real, Zin_imag, Zout_real).
#
# Two fixes compared with re-copying X_raw_train / X_raw_test here:
#   * the cell continues from the current X_train / X_test, so features built
#     by earlier preprocessing cells are kept;
#   * the cell is skipped when the three columns are not present (they are
#     commented out of feature_columns), instead of failing with a KeyError.
import joblib

impedance_cols = ["Zin_real", "Zin_imag", "Zout_real"]
log_cols = [col + "_log" for col in impedance_cols]
missing_impedance_cols = [col for col in impedance_cols if col not in X_train.columns]

if missing_impedance_cols:
    print(
        "Skipping impedance features; columns not present in X_train: "
        f"{missing_impedance_cols}"
    )
else:
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Apply the same stateless transforms to both splits
    for split_frame in (X_train, X_test):
        # Binary indicators for a negative real part
        split_frame["Zin_real_negative"] = (split_frame["Zin_real"] < 0).astype(int)
        split_frame["Zout_real_negative"] = (split_frame["Zout_real"] < 0).astype(int)

        # Signed log transform tames extreme values while preserving sign
        for col in impedance_cols:
            split_frame[f"{col}_log"] = signed_log_transform(split_frame[col])

    # Fit the scaler ONLY on training data, then apply it to both splits
    impedance_scaler = RobustScaler()
    impedance_scaler.fit(X_train[log_cols])
    X_train[log_cols] = impedance_scaler.transform(X_train[log_cols])
    X_test[log_cols] = impedance_scaler.transform(X_test[log_cols])

    # Interaction feature, built from the already scaled log columns
    X_train["Zin_real_imag_interaction"] = (
        X_train["Zin_real_log"] * X_train["Zin_imag_log"]
    )
    X_test["Zin_real_imag_interaction"] = (
        X_test["Zin_real_log"] * X_test["Zin_imag_log"]
    )

    # Drop the original columns now that they are represented by the new ones
    X_train = X_train.drop(impedance_cols, axis=1)
    X_test = X_test.drop(impedance_cols, axis=1)

    # Save the scaler for future use
    joblib.dump(impedance_scaler, "impedance_scaler.pkl")

In [None]:
# Display the column names currently held by the training feature frame
X_train.columns

In [None]:
# Development aid: import the project-local frequency_preprocessing module and
# reload it, so edits made to that file after the kernel started are picked up
import importlib

import frequency_preprocessing

importlib.reload(frequency_preprocessing)
# Imported after the reload so the name is bound to the reloaded function
from frequency_preprocessing import preprocess_frequency

# Then try using it
# NOTE(review): preprocess_frequency is defined outside this notebook.
# fit_mode=True presumably fits any frequency statistics on X_train only --
# confirm in frequency_preprocessing.py
X_train, X_test = preprocess_frequency(X_train, X_test, fit_mode=True)

In [None]:
# Zero-fill missing values in the five freq_pos_in_band columns
band_position_cols = [f"freq_pos_in_band_{band}" for band in range(1, 6)]
for band_col in band_position_cols:
    X_train[band_col] = X_train[band_col].fillna(0)
    if X_test is not None:
        X_test[band_col] = X_test[band_col].fillna(0)

# Zero-fill whatever NaN values remain in any other column
X_train = X_train.fillna(0)
X_test = X_test.fillna(0) if X_test is not None else X_test

In [None]:
# Display how many feature columns the training frame currently has
X_train.columns.size


In [None]:
# One model per S-parameter; each predicts the real and imaginary component.
# Keys are generated in the order S11, S12, S21, S22.
s_parameter_models = {
    f"S{row}{col}": [f"S_deemb({row},{col})_real", f"S_deemb({row},{col})_imag"]
    for row in (1, 2)
    for col in (1, 2)
}

# Filled per model by the analysis below:
# selected feature names, and the full importance table
best_features = dict()
importances_by_model = dict()

# Map every feature to a display category. The mapping depends only on the
# column names, so it is built once instead of once per model.
feature_categories = {}
for feature in X_train.columns:
    if "freq_band_" in feature or "band_" in feature:
        feature_categories[feature] = "Frequency Band"
    elif "freq_" in feature:
        feature_categories[feature] = "Frequency Feature"
    elif feature in ["vb", "vc"]:
        feature_categories[feature] = "Voltage"
    elif feature in ["gm", "Cpi", "Cmu"] or any(
        x in feature for x in ["gm_", "Cpi_", "Cmu_"]
    ):
        feature_categories[feature] = "Device Parameter"
    elif "Zin_" in feature or "Zout_" in feature:
        feature_categories[feature] = "Impedance"
    elif "DEV_" in feature or "TRANS_" in feature:
        feature_categories[feature] = "Device Geometry"
    else:
        feature_categories[feature] = "Other"


def evaluate_model(features, y_true_components):
    """Fit a multi-output random forest on ``features`` and score it on the test split.

    Trains on X_train / Y_raw_train, predicts X_test and returns
    ``(mean R², [R² per component])`` against Y_raw_test.
    """
    base_model = RandomForestRegressor(n_estimators=75, random_state=42, n_jobs=-1)
    model = MultiOutputRegressor(base_model)

    # Train on specified features
    model.fit(X_train[features], Y_raw_train[y_true_components])

    # Predict
    preds = model.predict(X_test[features])

    # Calculate R² for each component
    r2_scores = [
        r2_score(Y_raw_test[comp], preds[:, i])
        for i, comp in enumerate(y_true_components)
    ]

    return np.mean(r2_scores), r2_scores


# Feature-set sizes to compare: the fixed ladder, capped at the number of
# available columns, plus the full set (so the sizes always match the data)
n_total_features = len(X_train.columns)
feature_counts = sorted(
    {min(n, n_total_features) for n in (10, 15, 20, 25, 30)} | {n_total_features}
)

# Analyze each S-parameter model independently
for model_name, components in s_parameter_models.items():
    print(f"\n{'=' * 50}")
    print(f"Analyzing feature importance for {model_name} model")
    print(f"{'=' * 50}")

    # Get targets for this S-parameter model (both real and imaginary)
    Y_model = Y_raw_train[components]

    # One random forest per component; importances are averaged over all
    # components (a true mean for any number of components)
    component_importances = []
    for component in components:
        print(f"Training model for {component}...")

        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_train, Y_raw_train[component])
        component_importances.append(rf.feature_importances_)

    # Convert to DataFrame and sort, most important first
    importance_df = pd.DataFrame(
        {
            "feature": list(X_train.columns),
            "importance": np.mean(component_importances, axis=0),
        }
    ).sort_values("importance", ascending=False)

    # Store the importance dataframe
    importances_by_model[model_name] = importance_df

    # Print top 30 features
    print(f"\nTop 30 Features for {model_name}:")
    print(importance_df.head(30))

    # Plot feature importance
    plt.figure(figsize=(12, 10))
    plt.title(f"Top 25 Features for {model_name} Model")
    sns.barplot(x="importance", y="feature", data=importance_df.head(25))
    plt.tight_layout()
    plt.savefig(f"feature_importance_{model_name}.png")
    plt.close()

    # Add category column
    importance_df["category"] = importance_df["feature"].map(feature_categories)

    # Plot importance by category: one stacked panel per category, showing the
    # ten most important features of that category
    categories = sorted(importance_df["category"].unique())
    plt.figure(figsize=(15, 12))
    plt.suptitle(f"Feature Importance by Category for {model_name}", fontsize=16)

    for i, cat in enumerate(categories):
        cat_features = importance_df[importance_df["category"] == cat]
        plt.subplot(len(categories), 1, i + 1)
        sns.barplot(x="importance", y="feature", data=cat_features.head(10))
        plt.title(f"{cat} Features")

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for the suptitle
    plt.savefig(f"feature_importance_by_category_{model_name}.png")
    plt.close()

    # Baseline: performance with all features
    # NOTE(review): every score below is measured on the test split, so the
    # recommended feature set is tuned on test data; consider a separate
    # validation split for this selection.
    full_perf, full_component_perf = evaluate_model(X_train.columns, components)
    print(f"\nFull model performance (all {len(X_train.columns)} features):")
    print(f"  Average R²: {full_perf:.4f}")
    print(f"  {components[0]} R²: {full_component_perf[0]:.4f}")
    print(f"  {components[1]} R²: {full_component_perf[1]:.4f}")

    # Test performance with different feature set sizes
    results = []
    for n_features in feature_counts:
        # Get top N features
        top_features = importance_df.head(n_features)["feature"].tolist()

        # Evaluate
        avg_perf, component_perf = evaluate_model(top_features, components)

        # Save results
        results.append(
            {
                "n_features": n_features,
                "avg_r2": avg_perf,
                "real_r2": component_perf[0],
                "imag_r2": component_perf[1],
                "vs_full": avg_perf - full_perf,
            }
        )

    # Convert to DataFrame
    results_df = pd.DataFrame(results)

    # Print results
    print("\nPerformance with different feature set sizes:")
    print(results_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Plot performance vs feature count
    plt.figure(figsize=(10, 6))
    plt.plot(results_df["n_features"], results_df["avg_r2"], "o-", label="Average R²")
    plt.plot(
        results_df["n_features"],
        results_df["real_r2"],
        "s-",
        label=f"{components[0]} R²",
    )
    plt.plot(
        results_df["n_features"],
        results_df["imag_r2"],
        "^-",
        label=f"{components[1]} R²",
    )
    plt.axhline(
        y=full_perf,
        color="r",
        linestyle="--",
        label=f"Full model ({len(X_train.columns)} features)",
    )
    plt.xlabel("Number of Features")
    plt.ylabel("R² Score")
    plt.title(f"Model Performance vs Feature Count for {model_name}")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"performance_vs_features_{model_name}.png")
    plt.close()

    # Find optimal feature set
    # Use the smallest feature set that achieves at least 99% of full model performance
    target_performance = full_perf * 0.99
    meets_target = results_df[results_df["avg_r2"] >= target_performance]
    if meets_target.empty:
        # No subset reaches the target (possible e.g. when the full-model R²
        # is negative, which makes the target larger than the baseline).
        # Fall back to the best-scoring subset instead of failing on .iloc[0].
        print(
            f"\nNo feature set reached {target_performance:.4f} R²; "
            "falling back to the best-scoring set"
        )
        optimal_row = results_df.loc[results_df["avg_r2"].idxmax()]
    else:
        optimal_row = meets_target.iloc[0]
    optimal_feature_count = int(optimal_row["n_features"])

    print(f"\nRecommended feature set for {model_name}:")
    print(f"  Use top {optimal_feature_count} features")
    print(
        f"  Expected performance: {optimal_row['avg_r2']:.4f} R² "
        + f"({optimal_row['avg_r2'] / full_perf * 100:.1f}% of full model)"
    )

    # Store the best features
    best_features[model_name] = importance_df.head(optimal_feature_count)[
        "feature"
    ].tolist()

    # Print the specific features
    print("\nFeatures to use:")
    for i, feature in enumerate(best_features[model_name]):
        print(f"  {i + 1}. {feature}")

# Create a comparison of feature sets: one row per feature that was selected
# for at least one model. The union is sorted because a set has no stable
# iteration order for strings across kernel sessions; sorting makes the table
# and the heatmap reproducible.
all_selected_features = sorted(set().union(*best_features.values()))
comparison = pd.DataFrame(index=all_selected_features)

# For each model, mark which features are used (1) or not (0)
for model_name, features in best_features.items():
    comparison[model_name] = [1 if f in features else 0 for f in comparison.index]

# Add a "used in X models" column and put the most widely used features first
comparison["used_in_models"] = comparison.sum(axis=1)
comparison = comparison.sort_values("used_in_models", ascending=False)

print("\nFeature usage across models:")
print(comparison)

# Create a visualization of feature overlap (the count column is left out so
# the heatmap only shows the 0/1 usage flags)
plt.figure(figsize=(14, 12))
plt.title("Feature Usage Across S-Parameter Models")
sns.heatmap(
    comparison.drop("used_in_models", axis=1), cmap="YlGnBu", annot=True, cbar=False
)
plt.tight_layout()
plt.savefig("feature_usage_across_models.png")
plt.close()

# Print final summary of recommended feature sets
print("\n" + "=" * 70)
print("FINAL RECOMMENDED FEATURE SETS")
print("=" * 70)

for model_name in s_parameter_models.keys():
    print(f"\n{model_name} Model: {len(best_features[model_name])} features")
    for feature in best_features[model_name]:
        print(f"  - {feature}")

In [None]:
def mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """
    Calculate Mean Absolute Percentage Error with protection against division by zero.

    Samples whose true value has a magnitude of at most ``epsilon`` are
    excluded from the average, because a percentage error is undefined there.

    Parameters:
    -----------
    y_true : array-like
        Actual target values (list, tuple, numpy array or pandas Series)
    y_pred : array-like
        Predicted target values, same length and order as y_true
    epsilon : float, default=1e-10
        Threshold below which a true value is treated as zero; it is also
        added to the denominator of the remaining samples

    Returns:
    --------
    mape : float
        Mean Absolute Percentage Error in percent, or NaN when every true
        value is within epsilon of zero
    """
    # Convert up front so plain Python sequences support boolean masking;
    # inputs are compared position by position
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Keep only samples whose true value is clearly non-zero
    non_zero = np.abs(y_true) > epsilon

    if not non_zero.any():
        return np.nan  # Return NaN if all values are too close to zero

    kept_true = y_true[non_zero]
    kept_pred = y_pred[non_zero]

    # Percentage errors for the kept samples only
    percentage_errors = np.abs((kept_true - kept_pred) / (np.abs(kept_true) + epsilon)) * 100

    # Return the mean
    return np.mean(percentage_errors)


def train_evaluate_linear_regression_baseline(
    X_train, X_test, Y_train, Y_test, s_parameter_models
):
    """
    Fit one linear regression per S-parameter component and score it.

    For every component column listed in ``s_parameter_models`` a separate
    ``LinearRegression`` is fitted on ``X_train`` and evaluated on ``X_test``
    with RMSE, R², MAE and MAPE. A predicted-vs-actual scatter plot is saved as
    ``linear_regression_<component>_pred_vs_actual.png`` for each component and
    per-component, per-S-parameter and overall tables are printed.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Preprocessed feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets containing every component column
    s_parameter_models : dict
        Dictionary mapping S-parameter names to their component columns

    Returns:
    --------
    results : dict
        ``results[s_parameter][component]`` is a dict with the keys
        ``rmse``, ``r2``, ``mae`` and ``mape``
    models : dict
        ``models[s_parameter][component]`` is the fitted LinearRegression
    """
    separator = "=" * 50
    averaged_keys = ("r2", "rmse", "mae", "mape")

    results = {}
    models = {}
    averages = {}  # S-parameter name -> {metric name: mean over its components}

    # Wall-clock timing, for comparison with the other model families
    clock_start = time.time()

    for model_name, components in s_parameter_models.items():
        print(f"\n{separator}")
        print(f"Training linear regression for {model_name}")
        print(f"{separator}")

        component_metrics = {}
        component_models = {}
        scores = []  # one metrics dict per fitted component, in fit order

        # Real and imaginary parts get independent regressors
        for component in components:
            print(f"Training model for {component}...")

            actual = Y_test[component]

            regressor = LinearRegression()
            regressor.fit(X_train, Y_train[component])
            predicted = regressor.predict(X_test)

            rmse = np.sqrt(mean_squared_error(actual, predicted))
            r2 = r2_score(actual, predicted)
            mae = mean_absolute_error(actual, predicted)
            mape = mean_absolute_percentage_error(actual.values, predicted)

            print(f"  RMSE: {rmse:.6f}")
            print(f"  R²: {r2:.6f}")
            print(f"  MAE: {mae:.6f}")
            print(f"  MAPE: {mape:.2f}%")

            entry = {"rmse": rmse, "r2": r2, "mae": mae, "mape": mape}
            component_metrics[component] = entry
            component_models[component] = regressor
            scores.append(entry)

            # Predicted-vs-actual scatter with the ideal y = x line in red
            diagonal = [actual.min(), actual.max()]
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.scatter(actual, predicted, alpha=0.3)
            ax.plot(diagonal, diagonal, "r--")
            ax.set_xlabel("Actual")
            ax.set_ylabel("Predicted")
            ax.set_title(f"Linear Regression: {component} Predictions vs Actual")
            fig.savefig(f"linear_regression_{component}_pred_vs_actual.png")
            plt.close(fig)

        # Average each metric over the components of this S-parameter
        mean_scores = {
            key: np.mean([score[key] for score in scores]) for key in averaged_keys
        }

        print(f"\nAverage metrics for {model_name}:")
        print(f"  R²: {mean_scores['r2']:.6f}")
        print(f"  RMSE: {mean_scores['rmse']:.6f}")
        print(f"  MAE: {mean_scores['mae']:.6f}")
        print(f"  MAPE: {mean_scores['mape']:.2f}%")

        averages[model_name] = mean_scores
        results[model_name] = component_metrics
        models[model_name] = component_models

    train_time = time.time() - clock_start
    print(f"\nTotal training time: {train_time:.2f} seconds")

    # Per-component table
    print("\nDetailed Performance Summary:")
    print(
        f"{'S-Parameter':<10} {'Component':<20} {'RMSE':<10} {'R²':<10} {'MAE':<10} {'MAPE':<10}"
    )
    print("-" * 70)

    for model_name, per_component in results.items():
        for component, m in per_component.items():
            print(
                f"{model_name:<10} {component:<20} {m['rmse']:<10.6f} {m['r2']:<10.6f} {m['mae']:<10.6f} {m['mape']:<10.2f}%"
            )

    # Per-S-parameter table
    print("\nOverall S-Parameter Performance:")
    print(f"{'S-Parameter':<10} {'R²':<10} {'RMSE':<10} {'MAE':<10} {'MAPE':<10}")
    print("-" * 55)

    for model_name in s_parameter_models.keys():
        avg = averages[model_name]
        print(
            f"{model_name:<10} {avg['r2']:<10.6f} {avg['rmse']:<10.6f} {avg['mae']:<10.6f} {avg['mape']:<10.2f}%"
        )

    # Grand average across S-parameters
    overall = {
        key: np.mean([avg[key] for avg in averages.values()]) for key in averaged_keys
    }

    print("-" * 55)
    print(
        f"{'AVERAGE':<10} {overall['r2']:<10.6f} {overall['rmse']:<10.6f} {overall['mae']:<10.6f} {overall['mape']:<10.2f}%"
    )

    return results, models


# Feature importance analysis function remains unchanged
def analyze_linear_regression_coefficients(models, X_train, s_parameter_models):
    """
    Print and plot the coefficients of the fitted linear regression models.

    For each S-parameter the coefficients of its component models are gathered
    into one table (rows = features), ranked by the mean absolute coefficient
    across all components, printed (top 20) and drawn as horizontal bar charts
    (top 15) saved to ``linear_regression_<s_parameter>_coefficients.png``.

    Parameters:
    -----------
    models : dict
        ``models[s_parameter][component]`` -> fitted linear regression model
    X_train : pd.DataFrame
        Preprocessed feature dataset; only its column names are used
    s_parameter_models : dict
        Dictionary mapping S-parameter names to their component columns
    """
    feature_names = X_train.columns
    separator = "=" * 50

    for model_name, components in s_parameter_models.items():
        print(f"\n{separator}")
        print(f"Linear Regression Coefficients for {model_name}")
        print(f"{separator}")

        # One column of coefficients per component, indexed by feature name
        coef_df = pd.DataFrame(
            {
                component: models[model_name][component].coef_
                for component in components
            },
            index=feature_names,
        )

        # Rank features by the mean absolute coefficient across components
        coef_df["abs_importance"] = coef_df.abs().mean(axis=1)
        coef_df = coef_df.sort_values("abs_importance", ascending=False)

        print(f"\nTop 20 features for {model_name}:")
        print(coef_df.head(20))

        # One stacked panel per component showing the 15 top-ranked features
        top_features = coef_df.head(15).index
        n_panels = len(components)
        plt.figure(figsize=(12, 10))

        for panel_number, component in enumerate(components, start=1):
            plt.subplot(n_panels, 1, panel_number)
            plt.barh(top_features, coef_df.loc[top_features, component])
            plt.title(f"{component} Coefficients")
            plt.xlabel("Coefficient Value")
            plt.tight_layout()

        plt.savefig(f"linear_regression_{model_name}_coefficients.png")
        plt.close()


# Usage example
# Map each two-port S-parameter (S11, S12, S21, S22) to its real/imaginary
# de-embedded label columns.
s_parameter_models = {
    f"S{row}{col}": [f"S_deemb({row},{col})_real", f"S_deemb({row},{col})_imag"]
    for row in (1, 2)
    for col in (1, 2)
}

# Fit and score the linear baselines on the raw (unscaled) targets
results, models = train_evaluate_linear_regression_baseline(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    s_parameter_models,
)

# Inspect which features drive each baseline
analyze_linear_regression_coefficients(models, X_train, s_parameter_models)

In [None]:
# Per-column count of missing values in the training features
X_train.isna().sum()

In [None]:
# Train models with default hyperparameters
# (`hyperparameters=None` makes train_frequency_aware_models use its built-in
# defaults). The function returns four values: models, results, predictions
# and the fitted target scalers.
# NOTE(review): train_frequency_aware_models is defined in a later cell of this
# notebook - that cell must be executed before this one.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train, X_test, Y_raw_train, Y_raw_test
)

In [None]:
# Single-configuration grid: 150 epochs, early-stopping patience 20
param_grid = dict(
    learning_rate=[0.002],
    dropout_rate=[0.1],
    batch_size=[512],
    epochs=[150],
    early_stopping_patience=[20],
    hidden_sizes=[[256, 512, 1024, 512]],
    lr_scheduler_type=["one_cycle"],
    activation=["gelu"],
)

# Train and score every combination in the grid
tuning_results = hyperparameter_tuning(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    param_grid,
)

In [None]:
# Single-configuration grid: 200 epochs, early-stopping patience 30
param_grid = dict(
    learning_rate=[0.002],
    dropout_rate=[0.1],
    batch_size=[512],
    epochs=[200],
    early_stopping_patience=[30],
    hidden_sizes=[[256, 512, 1024, 512]],
    lr_scheduler_type=["one_cycle"],
    activation=["gelu"],
)

# Train and score every combination in the grid
tuning_results = hyperparameter_tuning(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    param_grid,
)

In [None]:
# Architecture sweep: everything fixed (300 epochs, patience 40) except the
# hidden-layer layout, which ranges from 3 to 6 layers at three width scales.
param_grid = dict(
    learning_rate=[0.002],
    dropout_rate=[0.1],
    batch_size=[512],
    epochs=[300],
    early_stopping_patience=[40],
    hidden_sizes=[
        # 3 layers
        [64, 128, 256],
        [128, 256, 512],
        [256, 512, 1024],
        # 4 layers
        [256, 512, 1024, 512],
        [512, 1024, 2048, 1024],
        [1024, 2048, 4096, 2048],
        # 5 layers
        [256, 512, 1024, 512, 256],
        [512, 1024, 2048, 1024, 512],
        [1024, 2048, 4096, 2048, 1024],
        # 6 layers
        [256, 512, 1024, 512, 256, 128],
        [512, 1024, 2048, 1024, 512, 256],
        [1024, 2048, 4096, 2048, 1024, 512],
    ],
    lr_scheduler_type=["one_cycle"],
    activation=["gelu"],
)

# Train and score every combination in the grid
tuning_results = hyperparameter_tuning(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    param_grid,
)

In [None]:
# Data preprocessing check: summary statistics of the raw S12 training targets
# (real part first, imaginary part second on every line)
print("S12 data statistics:")
for _label, _stat in (
    ("Mean:", "mean"),
    ("Std:", "std"),
    ("Min:", "min"),
    ("Max:", "max"),
):
    print(
        _label,
        getattr(Y_raw_train["S_deemb(1,2)_real"], _stat)(),
        getattr(Y_raw_train["S_deemb(1,2)_imag"], _stat)(),
    )

# Consider log scaling for S12 if values are very small
# Y_train_log = np.log10(np.abs(Y_train) + 1e-10) * np.sign(Y_train)

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.discriminant_analysis import StandardScaler

# Create the output directory that the functions below write plots, target
# scalers and model weights into (no error if it already exists)
os.makedirs("freq_aware_results", exist_ok=True)


# Define SMAPE function for better handling of small values
def symmetric_mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return SMAPE in percent.

    Each absolute error is divided by the mean magnitude of the true and
    predicted value; ``epsilon`` is added to that denominator so that a pair
    of zeros does not divide by zero.
    """
    abs_error = np.abs(y_true - y_pred)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2.0 + epsilon
    return np.mean(abs_error / mean_magnitude) * 100


# Define mean absolute percentage error function
def mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return MAPE in percent, ignoring samples whose true value is ~0.

    Samples with ``|y_true| <= epsilon`` are excluded; NaN is returned when no
    sample remains. ``epsilon`` is also added to the denominator.
    """
    usable = np.abs(y_true) > epsilon
    if usable.sum() == 0:
        return np.nan

    actual = y_true[usable]
    relative_error = (actual - y_pred[usable]) / (np.abs(actual) + epsilon)
    return np.mean(np.abs(relative_error) * 100)


# Frequency-aware neural network
class FrequencyAwareNetwork(nn.Module):
    """Two-branch MLP that processes frequency features separately.

    The input columns selected by ``freq_indices`` go through one branch and
    the columns selected by ``other_indices`` through another. Both branches
    use the first two entries of ``hidden_sizes``; their outputs are
    concatenated and passed through the remaining hidden sizes followed by a
    final 2-unit linear layer (real and imaginary component).

    Every hidden layer is ``Linear -> activation -> BatchNorm1d -> Dropout``.

    Parameters
    ----------
    freq_features : int
        Number of frequency-related input columns.
    other_features : int
        Number of remaining input columns.
    hidden_sizes : sequence of int, default=(64, 128, 256)
        Layer widths. Entries 0-1 size the branches, entries 2+ size the
        combined trunk. Fewer than two entries is allowed: the branches then
        simply have fewer layers.
    dropout_rate : float, default=0.2
        Dropout probability after every hidden layer.
    activation : {"silu", "relu", "gelu"}, default="silu"
        Hidden-layer activation.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.

    Notes
    -----
    ``set_feature_indices`` must be called before ``forward``. The indices are
    plain attributes, not buffers, so they are not part of ``state_dict``.
    """

    def __init__(
        self,
        freq_features,
        other_features,
        hidden_sizes=(64, 128, 256),
        dropout_rate=0.2,
        activation="silu",
    ):
        super().__init__()

        if activation == "silu":
            activation_fn = nn.SiLU()
        elif activation == "relu":
            activation_fn = nn.ReLU()
        elif activation == "gelu":
            activation_fn = nn.GELU()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        hidden_sizes = list(hidden_sizes)
        branch_sizes = hidden_sizes[:2]
        trunk_sizes = hidden_sizes[2:]

        def hidden_stack(in_size, sizes):
            """Build Linear/activation/BatchNorm/Dropout layers; return them and the output width."""
            layers = []
            for size in sizes:
                layers.append(nn.Linear(in_size, size))
                # The activation modules used here are stateless, so one
                # instance is shared by all layers.
                layers.append(activation_fn)
                layers.append(nn.BatchNorm1d(size))
                layers.append(nn.Dropout(dropout_rate))
                in_size = size
            return layers, in_size

        # Frequency-specific processing branch
        freq_layers, freq_out = hidden_stack(freq_features, branch_sizes)
        self.freq_branch = nn.Sequential(*freq_layers)

        # Other parameters branch
        other_layers, other_out = hidden_stack(other_features, branch_sizes)
        self.other_branch = nn.Sequential(*other_layers)

        # Combined trunk. Its input width is the actual width of the two
        # concatenated branch outputs (2 * hidden_sizes[1] in the usual case),
        # which also works when fewer than two hidden sizes are given.
        combined_layers, trunk_out = hidden_stack(freq_out + other_out, trunk_sizes)

        # Final output layer for real and imaginary components
        combined_layers.append(nn.Linear(trunk_out, 2))
        self.combined = nn.Sequential(*combined_layers)

        # Column indices used by forward(); set via set_feature_indices()
        self.freq_indices = None
        self.other_indices = None

    def forward(self, x):
        """Split ``x`` by column into the two branches and return the 2-column output.

        Raises
        ------
        ValueError
            If ``set_feature_indices`` has not been called yet.
        """
        if self.freq_indices is None or self.other_indices is None:
            raise ValueError(
                "Feature indices not set. Call set_feature_indices() first."
            )

        freq_input = x[:, self.freq_indices]
        other_input = x[:, self.other_indices]

        # Process through branches
        freq_features = self.freq_branch(freq_input)
        other_features = self.other_branch(other_input)

        # Combine and output
        combined = torch.cat([freq_features, other_features], dim=1)
        return self.combined(combined)

    def set_feature_indices(self, freq_indices, other_indices):
        """Set indices for frequency and other features."""
        self.freq_indices = freq_indices
        self.other_indices = other_indices


# Helper function to identify frequency-related features
def identify_frequency_features(X_columns):
    """Split column positions into frequency-related and other columns.

    A column counts as frequency-related when its lower-cased name contains
    ``"freq"`` or ``"band"``. Returns ``(freq_positions, other_positions)``,
    both as lists of integer positions in ascending order, and prints how many
    columns fell into each group.
    """
    freq_features = []
    other_features = []

    for position, column in enumerate(X_columns):
        if "freq" in column.lower() or "band" in column.lower():
            freq_features.append(position)
        else:
            other_features.append(position)

    print(
        f"Identified {len(freq_features)} frequency-related features and {len(other_features)} other features"
    )
    return freq_features, other_features


# Modified prepare_data_for_pytorch to handle scaling
def prepare_data_for_pytorch_with_scaling(
    X_train, Y_train, X_test, Y_test, components, batch_size=128, scale_y=True
):
    """Prepare data for PyTorch models with optional Y-scaling.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature tables; converted to float tensors as-is.
    Y_train, Y_test : pd.DataFrame
        Target tables; only the ``components`` columns are used.
    components : list of str
        Target column names, in the order they should appear in the tensors.
    batch_size : int, default=128
        Batch size of the (shuffled) training loader.
    scale_y : bool, default=True
        When True, a ``StandardScaler`` is fitted on the training targets only,
        applied to both target sets, and saved to
        ``freq_aware_results/<components joined by '_'>_scaler.pkl``.

    Returns
    -------
    tuple
        ``(X_train_tensor, Y_train_tensor, X_test_tensor, Y_test_tensor,
        train_loader, y_scaler)`` where ``y_scaler`` is the fitted scaler or
        ``None`` when ``scale_y`` is False.
    """
    # Convert features to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train.values)
    X_test_tensor = torch.FloatTensor(X_test.values)

    Y_train_values = Y_train[components].values
    Y_test_values = Y_test[components].values

    y_scaler = None
    if scale_y:
        # Imported locally so the function does not rely on another cell
        # having imported them; StandardScaler's public home is
        # sklearn.preprocessing.
        import joblib
        from sklearn.preprocessing import StandardScaler

        # Fit on the training targets only so no test statistics leak in
        y_scaler = StandardScaler()
        Y_train_values = y_scaler.fit_transform(Y_train_values)
        Y_test_values = y_scaler.transform(Y_test_values)

        # Save scaler for later use (make sure the target directory exists)
        os.makedirs("freq_aware_results", exist_ok=True)
        component_str = "_".join(components)
        joblib.dump(y_scaler, f"freq_aware_results/{component_str}_scaler.pkl")

    Y_train_tensor = torch.FloatTensor(Y_train_values)
    Y_test_tensor = torch.FloatTensor(Y_test_values)

    # Only the training data is batched; the test tensors are used whole
    train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    return (
        X_train_tensor,
        Y_train_tensor,
        X_test_tensor,
        Y_test_tensor,
        train_loader,
        y_scaler,
    )


def train_model(
    model,
    train_loader,
    X_test_tensor,
    Y_test_tensor,
    criterion,
    optimizer,
    device,
    epochs=100,
    early_stopping_patience=15,
    verbose=True,
    lr_scheduler_type="reduce_on_plateau",
    warmup_epochs=5,
):
    """Train a PyTorch model with early stopping and learning rate scheduling.

    Parameters
    ----------
    model : nn.Module
        Model to train; moved to ``device``.
    train_loader : DataLoader
        Yields ``(inputs, targets)`` training batches.
    X_test_tensor, Y_test_tensor : torch.Tensor
        Held-out data used as the validation set for scheduling and early
        stopping. NOTE(review): callers in this notebook pass the test split
        here, so the reported test metrics are optimistically biased.
    criterion : callable
        Loss function ``criterion(outputs, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.
    device : torch.device
        Device to train on.
    epochs : int, default=100
        Maximum number of epochs.
    early_stopping_patience : int, default=15
        Stop after this many consecutive epochs without a new best
        validation loss.
    verbose : bool, default=True
        Print progress every 10 epochs, learning-rate reductions and the
        early-stopping epoch.
    lr_scheduler_type : {"reduce_on_plateau", "cosine_annealing", "one_cycle"} or other
        Scheduler to use; any other value disables scheduling.
    warmup_epochs : int, default=5
        Linear learning-rate warm-up length. Only applied when no scheduler
        is active.

    Returns
    -------
    model : nn.Module
        The model with the weights of the best validation epoch restored.
    train_losses, val_losses : list of float
        Per-epoch average training loss and validation loss.

    Side effects
    ------------
    Saves the learning-rate curve to
    ``freq_aware_results/learning_rate_schedule.png``.
    """
    model = model.to(device)

    # Set up learning rate scheduler based on specified type
    if lr_scheduler_type == "reduce_on_plateau":
        # The scheduler's own `verbose` flag is deprecated in recent PyTorch;
        # reductions are reported manually below instead.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.85, patience=5, min_lr=5e-7
        )
    elif lr_scheduler_type == "cosine_annealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=epochs, eta_min=1e-6
        )
    elif lr_scheduler_type == "one_cycle":
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=optimizer.param_groups[0]["lr"],
            steps_per_epoch=len(train_loader),
            epochs=epochs,
        )
    else:
        scheduler = None

    # Target learning rates for the manual warm-up, one per parameter group.
    # Warm-up must scale from these fixed values, not from the current
    # (already scaled) learning rate.
    base_lrs = [group["lr"] for group in optimizer.param_groups]

    # The validation tensors never change, so move them to the device once
    X_val = X_test_tensor.to(device)
    Y_val = Y_test_tensor.to(device)

    # For early stopping
    best_loss = float("inf")
    best_model_state = None
    patience_counter = 0

    # Track losses and learning rates for plotting
    train_losses = []
    val_losses = []
    learning_rates = []

    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Linear warm-up (only without a scheduler): base_lr * (epoch+1)/warmup,
        # reaching the full base learning rate in the last warm-up epoch.
        if warmup_epochs > 0 and epoch < warmup_epochs and scheduler is None:
            lr_multiplier = (epoch + 1) / warmup_epochs
            for param_group, base_lr in zip(optimizer.param_groups, base_lrs):
                param_group["lr"] = base_lr * lr_multiplier

        # Record current learning rate
        current_lr = optimizer.param_groups[0]["lr"]
        learning_rates.append(current_lr)

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # OneCycleLR is stepped once per batch, not per epoch
            if lr_scheduler_type == "one_cycle":
                scheduler.step()

            running_loss += loss.item()

        # Calculate average training loss
        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation loss
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, Y_val).item()
            val_losses.append(val_loss)

        # Print progress
        if verbose and (epoch + 1) % 10 == 0:
            print(
                f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {val_loss:.6f}, LR: {current_lr:.8f}"
            )

        # Learning rate scheduler step (except for OneCycleLR which is done per iteration)
        if scheduler is not None:
            if lr_scheduler_type == "reduce_on_plateau":
                lrs_before = [group["lr"] for group in optimizer.param_groups]
                scheduler.step(val_loss)
                if verbose:
                    for group_idx, (old_lr, group) in enumerate(
                        zip(lrs_before, optimizer.param_groups)
                    ):
                        if group["lr"] < old_lr:
                            print(
                                f"Epoch {epoch + 1}: reducing learning rate of group {group_idx} to {group['lr']:.4e}."
                            )
            elif lr_scheduler_type == "cosine_annealing":
                scheduler.step()

        # Check for early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() tensors share storage with the live parameters, so
            # they must be cloned; a shallow dict copy would keep changing as
            # training continues and "restoring" it would be a no-op.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                if verbose:
                    print(f"Early stopping at epoch {epoch + 1}")
                break

    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Plot learning rate schedule (one point per epoch, taken at epoch start)
    plt.figure(figsize=(10, 4))
    plt.plot(learning_rates)
    plt.xlabel("Epochs")
    plt.ylabel("Learning Rate")
    plt.title("Learning Rate Schedule")
    plt.yscale("log")
    plt.savefig("freq_aware_results/learning_rate_schedule.png")
    plt.close()

    return model, train_losses, val_losses


# Modified evaluate_model function to handle scaling
def evaluate_model_with_scaling(
    model, X_test_tensor, Y_test_tensor, Y_test, components, device, y_scaler=None
):
    """Evaluate a trained model and calculate performance metrics.

    Predictions are mapped back to the original target scale (when a scaler is
    given) and compared against the unscaled ``Y_test[components]``.

    Parameters
    ----------
    model : nn.Module
        Trained model; put into eval mode.
    X_test_tensor : torch.Tensor
        Test features.
    Y_test_tensor : torch.Tensor
        Unused; kept so existing callers keep working.
    Y_test : pd.DataFrame
        Unscaled targets containing the ``components`` columns.
    components : list of str
        Target columns, in the same order as the model's output columns.
    device : torch.device
        Device the model lives on.
    y_scaler : fitted scaler or None
        If given, its ``inverse_transform`` is applied to the predictions.

    Returns
    -------
    metrics : dict
        ``metrics[component]`` holds ``mse``, ``rmse``, ``r2``, ``mae`` and
        either ``smape`` (S12 components) or ``mape`` (all others).
    avg_metrics : dict
        Component means of ``rmse``, ``r2``, ``mae`` plus the mean of every
        percentage metric (``smape`` and/or ``mape``) that was computed.
    predictions_original : np.ndarray
        Predictions on the original target scale.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor.to(device)).cpu().numpy()

    # Inverse transform if scaler was used
    if y_scaler is not None:
        predictions_original = y_scaler.inverse_transform(predictions)
    else:
        predictions_original = predictions
    y_test_original = Y_test[components].values

    # Calculate metrics
    metrics = {}

    for i, component in enumerate(components):
        y_true = y_test_original[:, i]
        y_pred = predictions_original[:, i]

        mse = mean_squared_error(y_true, y_pred)
        component_metrics = {
            "mse": mse,
            "rmse": np.sqrt(mse),
            "r2": r2_score(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
        }

        # Use SMAPE instead of MAPE for S12; regular MAPE for the others
        if "S12" in component or "S_deemb(1,2)" in component:
            component_metrics["smape"] = symmetric_mean_absolute_percentage_error(
                y_true, y_pred
            )
        else:
            component_metrics["mape"] = mean_absolute_percentage_error(y_true, y_pred)

        metrics[component] = component_metrics

    # Calculate average metrics
    avg_metrics = {
        "rmse": np.mean([metrics[comp]["rmse"] for comp in components]),
        "r2": np.mean([metrics[comp]["r2"] for comp in components]),
        "mae": np.mean([metrics[comp]["mae"] for comp in components]),
    }

    # Average each percentage metric over the components that actually have
    # it, so a mix of S12 and non-S12 components does not raise KeyError.
    for pct_key in ("smape", "mape"):
        pct_values = [
            metrics[comp][pct_key] for comp in components if pct_key in metrics[comp]
        ]
        if pct_values:
            avg_metrics[pct_key] = np.mean(pct_values)

    return metrics, avg_metrics, predictions_original


def plot_learning_curves(train_losses, val_losses, model_name):
    """Save the training/validation loss curves of one model.

    The figure is written to
    ``freq_aware_results/learning_curves_<model_name>.png`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for losses, label in (
        (train_losses, "Training Loss"),
        (val_losses, "Validation Loss"),
    ):
        ax.plot(losses, label=label)

    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.set_title(f"Learning Curves for {model_name}")
    ax.legend()

    fig.savefig(f"freq_aware_results/learning_curves_{model_name}.png")
    plt.close(fig)


def plot_predictions(Y_test, predictions, components, model_name):
    """Save predicted-vs-actual scatter plots, one panel per component.

    Column ``i`` of ``predictions`` is compared with ``Y_test[components[i]]``;
    the dashed red line marks a perfect prediction. The figure is written to
    ``freq_aware_results/predictions_<model_name>.png`` and then closed.
    """
    fig, axes = plt.subplots(1, len(components), figsize=(15, 5))
    # plt.subplots returns a bare Axes (not an array) for a single panel
    panels = axes if len(components) > 1 else [axes]

    for column, component in enumerate(components):
        panel = panels[column]
        actual = Y_test[component].values
        predicted = predictions[:, column]
        low, high = min(actual), max(actual)

        panel.scatter(actual, predicted, alpha=0.3)
        panel.plot([low, high], [low, high], "r--")
        panel.set_xlabel("Actual")
        panel.set_ylabel("Predicted")
        panel.set_title(f"{component}")

    fig.tight_layout()
    fig.savefig(f"freq_aware_results/predictions_{model_name}.png")
    plt.close(fig)


def plot_error_distribution(Y_test, predictions, components, model_name):
    """Save histograms (with KDE) of the prediction errors, one panel per component.

    The error is ``prediction - actual``. The figure is written to
    ``freq_aware_results/error_dist_<model_name>.png`` and then closed.
    """
    fig, axes = plt.subplots(1, len(components), figsize=(15, 5))
    # plt.subplots returns a bare Axes (not an array) for a single panel
    panels = axes if len(components) > 1 else [axes]

    for column, component in enumerate(components):
        panel = panels[column]
        residuals = predictions[:, column] - Y_test[component].values

        sns.histplot(residuals, kde=True, ax=panel)
        panel.set_xlabel("Prediction Error")
        panel.set_ylabel("Frequency")
        panel.set_title(f"{component} Error Distribution")

    fig.tight_layout()
    fig.savefig(f"freq_aware_results/error_dist_{model_name}.png")
    plt.close(fig)


# Modified train_frequency_aware_models function
def train_frequency_aware_models(
    X_train, X_test, Y_train, Y_test, hyperparameters=None, selected_features=None
):
    """
    Train one frequency-aware network per configured S-parameter.

    Currently only S21 is configured. Target scaling is applied only to a
    model named "S12". For every model the learning curves, prediction scatter
    plots and error histograms are saved under ``freq_aware_results/``, the
    metrics are printed, and the weights are saved as
    ``freq_aware_results/<name>_model.pth``.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature tables.
    Y_train, Y_test : pd.DataFrame
        Target tables containing the component columns.
    hyperparameters : dict or None
        Keys: hidden_sizes, dropout_rate, learning_rate, batch_size, epochs,
        early_stopping_patience and optionally activation, lr_scheduler_type.
        Built-in defaults are used when None.
    selected_features : list of str or None
        If given, only these feature columns are used.

    Returns
    -------
    models : dict
        S-parameter name -> trained model.
    all_results : dict
        S-parameter name -> {"component_metrics": ..., "avg_metrics": ...}.
    all_predictions : dict
        S-parameter name -> predictions on the original target scale.
    scalers : dict
        S-parameter name -> fitted target scaler (only for scaled models).
    """
    # S-parameter definitions; S12 is currently disabled:
    # 'S12': ['S_deemb(1,2)_real', 'S_deemb(1,2)_imag']
    s_parameter_models = {
        "S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"],
    }

    # Fall back to the default configuration
    if hyperparameters is None:
        hyperparameters = dict(
            hidden_sizes=[64, 128, 256],
            dropout_rate=0.2,
            learning_rate=0.001,
            batch_size=256,
            epochs=150,
            early_stopping_patience=15,
            activation="gelu",
            lr_scheduler_type="one_cycle",
        )

    # Optional feature subset
    if selected_features is not None:
        X_train = X_train[selected_features]
        X_test = X_test[selected_features]
        print(f"Using {len(selected_features)} selected features")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Column positions routed to the frequency branch vs. the other branch
    freq_indices, other_indices = identify_frequency_features(X_train.columns)

    models = {}
    all_results = {}
    all_predictions = {}
    scalers = {}  # fitted target scalers, only for models trained on scaled Y

    separator = "=" * 50
    clock_start = time.time()

    for model_name, components in s_parameter_models.items():
        print(f"\n{separator}")
        print(f"Training frequency-aware model for {model_name}")
        print(f"{separator}")

        # Targets are standardised only for S12
        scale_y = model_name == "S12"

        (
            X_train_tensor,
            Y_train_tensor,
            X_test_tensor,
            Y_test_tensor,
            train_loader,
            y_scaler,
        ) = prepare_data_for_pytorch_with_scaling(
            X_train,
            Y_train,
            X_test,
            Y_test,
            components,
            hyperparameters["batch_size"],
            scale_y=scale_y,
        )

        if scale_y:
            scalers[model_name] = y_scaler
            print("Applied StandardScaler to Y values for S12")

        # Build the network and tell it which columns feed which branch
        model = FrequencyAwareNetwork(
            len(freq_indices),
            len(other_indices),
            hyperparameters["hidden_sizes"],
            hyperparameters["dropout_rate"],
            hyperparameters.get("activation", "gelu"),
        )
        model.set_feature_indices(freq_indices, other_indices)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=hyperparameters["learning_rate"])

        trained_model, train_losses, val_losses = train_model(
            model,
            train_loader,
            X_test_tensor,
            Y_test_tensor,
            criterion,
            optimizer,
            device,
            hyperparameters["epochs"],
            hyperparameters["early_stopping_patience"],
            lr_scheduler_type=hyperparameters.get("lr_scheduler_type", "one_cycle"),
        )

        plot_learning_curves(train_losses, val_losses, model_name)

        # Metrics are computed on the original target scale
        metrics, avg_metrics, predictions = evaluate_model_with_scaling(
            trained_model,
            X_test_tensor,
            Y_test_tensor,
            Y_test,
            components,
            device,
            scalers.get(model_name),
        )

        plot_predictions(Y_test, predictions, components, model_name)
        plot_error_distribution(Y_test, predictions, components, model_name)

        # Per-component report; the percentage metric is SMAPE when present,
        # MAPE otherwise
        print(f"\nPerformance metrics for {model_name}:")
        for component, metric in metrics.items():
            pct_key = "smape" if "smape" in metric else "mape"
            print(f"  {component}:")
            print(f"    RMSE: {metric['rmse']:.6f}")
            print(f"    R²: {metric['r2']:.6f}")
            print(f"    MAE: {metric['mae']:.6f}")
            print(f"    {pct_key.upper()}: {metric[pct_key]:.2f}%")

        avg_pct_key = "smape" if "smape" in avg_metrics else "mape"
        print(f"\nAverage metrics for {model_name}:")
        print(f"  R²: {avg_metrics['r2']:.6f}")
        print(f"  RMSE: {avg_metrics['rmse']:.6f}")
        print(f"  MAE: {avg_metrics['mae']:.6f}")
        print(f"  {avg_pct_key.upper()}: {avg_metrics[avg_pct_key]:.2f}%")

        models[model_name] = trained_model
        all_results[model_name] = {
            "component_metrics": metrics,
            "avg_metrics": avg_metrics,
        }
        all_predictions[model_name] = predictions

    train_time = time.time() - clock_start
    print(f"\nTotal training time: {train_time:.2f} seconds")

    # Persist the trained weights
    for model_name, model in models.items():
        torch.save(model.state_dict(), f"freq_aware_results/{model_name}_model.pth")

    print("Models and results saved to freq_aware_results/")

    return models, all_results, all_predictions, scalers


# Function to experiment with different hyperparameters
def hyperparameter_tuning(X_train, X_test, Y_train, Y_test, param_grid):
    """
    Perform hyperparameter tuning by training models with different configurations.

    Every combination of the values in ``param_grid`` is passed to
    ``train_frequency_aware_models`` and scored by the mean of the per-model
    average R² values it reports.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Preprocessed feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    param_grid : dict
        Dictionary mapping a hyperparameter name to the list of values to try

    Returns:
    --------
    results : dict
        Maps a configuration name (``"key=value"`` pairs joined by ``_``) to a
        dict with keys ``"params"``, ``"avg_r2"`` and ``"detailed_results"``
    """
    results = {}

    # Generate all hyperparameter combinations
    param_keys = list(param_grid.keys())
    param_values = list(param_grid.values())

    def generate_combinations(index, current_params):
        """Depth-first walk over the grid; trains once per complete combination."""
        if index == len(param_keys):
            # Train model with current parameter combination
            config_name = "_".join([f"{k}={v}" for k, v in current_params.items()])
            print(f"\n\n{'#' * 70}")
            print(f"# Testing configuration: {config_name}")
            print(f"{'#' * 70}\n")

            # train_frequency_aware_models returns
            # (models, all_results, all_predictions, scalers); only the metrics
            # are needed here. The starred target absorbs every trailing item,
            # so the unpacking no longer fails on the 4-tuple.
            # A copy is passed because current_params keeps being mutated by
            # the enclosing recursion.
            _, all_results, *_ = train_frequency_aware_models(
                X_train, X_test, Y_train, Y_test, hyperparameters=dict(current_params)
            )

            # Store results
            avg_r2 = np.mean(
                [result["avg_metrics"]["r2"] for result in all_results.values()]
            )
            results[config_name] = {
                "params": current_params.copy(),
                "avg_r2": avg_r2,
                "detailed_results": all_results,
            }
            return

        # Recursive exploration of parameter combinations
        for value in param_values[index]:
            current_params[param_keys[index]] = value
            generate_combinations(index + 1, current_params)

    # Start generating combinations
    generate_combinations(0, {})

    # Rank results (best average R² first)
    ranked_results = sorted(results.items(), key=lambda x: x[1]["avg_r2"], reverse=True)

    # Print summary
    print("\n\n" + "=" * 80)
    print("HYPERPARAMETER TUNING RESULTS")
    print("=" * 80)

    for i, (config_name, result) in enumerate(ranked_results):
        print(f"\n{i + 1}. Configuration: {config_name}")
        print(f"   Average R²: {result['avg_r2']:.6f}")
        print(f"   Parameters: {result['params']}")

    return results


# Function to test different feature subsets
def feature_selection_experiment(X_train, X_test, Y_train, Y_test, feature_sets):
    """
    Test different feature subsets to find optimal combinations.

    Each feature set is passed to ``train_frequency_aware_models`` through its
    ``selected_features`` argument and scored by the mean of the per-model
    average R² values it reports.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Complete feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    feature_sets : dict
        Dictionary mapping set names to lists of feature columns

    Returns:
    --------
    results : dict
        Maps each set name to a dict with keys ``"features"``,
        ``"feature_count"``, ``"avg_r2"`` and ``"detailed_results"``
    """
    results = {}

    for set_name, features in feature_sets.items():
        print(f"\n\n{'#' * 70}")
        print(f"# Testing feature set: {set_name} ({len(features)} features)")
        print(f"{'#' * 70}\n")

        # train_frequency_aware_models returns
        # (models, all_results, all_predictions, scalers); only the metrics are
        # needed here. The starred target absorbs every trailing item, so the
        # unpacking no longer fails on the 4-tuple.
        _, all_results, *_ = train_frequency_aware_models(
            X_train, X_test, Y_train, Y_test, selected_features=features
        )

        # Store results
        avg_r2 = np.mean(
            [result["avg_metrics"]["r2"] for result in all_results.values()]
        )
        results[set_name] = {
            "features": features,
            "feature_count": len(features),
            "avg_r2": avg_r2,
            "detailed_results": all_results,
        }

    # Rank results (best average R² first)
    ranked_results = sorted(results.items(), key=lambda x: x[1]["avg_r2"], reverse=True)

    # Print summary
    print("\n\n" + "=" * 80)
    print("FEATURE SELECTION RESULTS")
    print("=" * 80)

    for i, (set_name, result) in enumerate(ranked_results):
        print(f"\n{i + 1}. Feature Set: {set_name}")
        print(f"   Features: {len(result['features'])}")
        print(f"   Average R²: {result['avg_r2']:.6f}")

    return results


# Example usage


# Example of running with all features and default hyperparameters
# models, results, predictions, scalers = train_frequency_aware_models(
#     X_train, X_test, Y_raw_train, Y_raw_test,
#     hyperparameters=default_hyperparameters
# )

# Example of hyperparameter tuning
# param_grid = {
#     'learning_rate': [0.0001, 0.001, 0.01],
#     'dropout_rate': [0.1, 0.2, 0.3],
#     'batch_size': [128, 256, 512]
# }
# tuning_results = hyperparameter_tuning(X_train, X_test, Y_raw_train, Y_raw_test, param_grid)

# Example of feature selection experiment
# core_features = ['freq', 'vb', 'vc', 'gm_abs_log']
# freq_features = [col for col in X_train.columns if 'freq' in col]
# impedance_features = [col for col in X_train.columns if 'Zin' in col or 'Zout' in col]

# feature_sets = {
#     'all_features': X_train.columns.tolist(),
#     'frequency_only': freq_features,
#     'core_plus_frequency': core_features + freq_features,
#     'core_plus_impedance': core_features + impedance_features,
#     'optimized_set': ['freq', 'freq_log', 'freq_log_norm', 'vb', 'vc', 'gm_abs_log',
#                       'Zin_real_log', 'Zin_imag_log', 'Zout_real_log']
# }
# feature_results = feature_selection_experiment(X_train, X_test, Y_raw_train, Y_raw_test, feature_sets)

In [None]:
# Hyperparameter set handed to train_frequency_aware_models for this run
# (five hidden layer sizes, GELU activation, reduce-on-plateau LR schedule).
best_hyperparameters = {
    "hidden_sizes": [384, 768, 1536, 768, 384],
    "dropout_rate": 0.1,
    "learning_rate": 0.002,
    "batch_size": 512,
    "epochs": 300,
    "early_stopping_patience": 40,
    "activation": "gelu",
    "lr_scheduler_type": "reduce_on_plateau",
}

# Train with scaling for S12
# NOTE(review): nothing in this call selects S12 specifically; confirm which
# S-parameter models train_frequency_aware_models actually trains.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train, X_test, Y_raw_train, Y_raw_test, hyperparameters=best_hyperparameters
)

# You can also save the scalers for future use
# NOTE(review): requires `joblib` to have been imported by an earlier cell.
# The next cell dumps to this same path, so this file is overwritten when
# both cells are run.
joblib.dump(scalers, "freq_aware_results/all_scalers.pkl")

In [None]:
# Alternative hyperparameter set (four hidden layer sizes, fewer epochs and a
# shorter early-stopping patience than the previous cell). Rebinds the same
# `best_hyperparameters` name.
best_hyperparameters = {
    "learning_rate": 0.002,
    "dropout_rate": 0.1,
    "batch_size": 512,
    "epochs": 200,
    "early_stopping_patience": 30,
    "hidden_sizes": [256, 512, 1024, 512],
    "lr_scheduler_type": "reduce_on_plateau",
    "activation": "gelu",
}

# Train with scaling for S11
# NOTE(review): nothing in this call selects S11 specifically; confirm which
# S-parameter models train_frequency_aware_models actually trains.
# This rebinds models/results/predictions/scalers from the previous cell.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train, X_test, Y_raw_train, Y_raw_test, hyperparameters=best_hyperparameters
)

# You can also save the scalers for future use
# NOTE(review): requires `joblib` to have been imported by an earlier cell.
# Same output path as the previous cell, so that cell's scalers file is replaced.
joblib.dump(scalers, "freq_aware_results/all_scalers.pkl")

In [None]:
import itertools
from copy import deepcopy


def hyperparameter_tuning2(
    X_train,
    X_test,
    Y_train,
    Y_test,
    param_grid,
    s_parameter=None,
    selected_features=None,
    seed=42,
):
    """
    Perform hyperparameter tuning by training models with different configurations.

    Every combination in the Cartesian product of ``param_grid`` values is
    passed to ``train_frequency_aware_models``. A configuration is scored by
    the mean of the per-model average R² values. Configurations that raise are
    recorded with an ``"error"`` entry instead of aborting the search. A CSV
    summary of the successful configurations is written to
    ``freq_aware_results/hyperparameter_tuning_results.csv``.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Preprocessed feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    param_grid : dict
        Dictionary mapping a hyperparameter name to the list of values to try
    s_parameter : dict or None
        Dictionary mapping model names to component lists (e.g., {'S21': [...]}).
        Currently NOT forwarded to ``train_frequency_aware_models`` (which does
        not accept it), so it has no effect on what is trained; a warning is
        emitted when it is supplied.
    selected_features : list or None
        List of feature columns to use
    seed : int
        Random seed applied once (NumPy and PyTorch) before the search starts

    Returns:
    --------
    results : dict
        Maps configuration name to either
        ``{"hyperparameters", "avg_r2", "detailed_results", "models", "scalers"}``
        or, on failure, ``{"hyperparameters", "error"}``
    best_config : dict or None
        Hyperparameters of the highest-scoring configuration; ``None`` when no
        configuration produced a score greater than ``-inf``
    """
    # Function-scope import keeps this definition self-contained in the notebook.
    import warnings

    if s_parameter is not None:
        # Previously this argument was dropped silently, which made callers
        # believe they were tuning a single S-parameter.
        warnings.warn(
            "hyperparameter_tuning2: `s_parameter` is accepted for API "
            "compatibility but is not forwarded to train_frequency_aware_models, "
            "so it does not restrict which S-parameter models are trained.",
            stacklevel=2,
        )

    # Set random seed for reproducibility
    np.random.seed(seed)
    torch.manual_seed(seed)

    results = {}
    best_score = -float("inf")
    best_config = None

    # Generate all hyperparameter combinations
    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())

    # Create all combinations
    combinations = list(itertools.product(*param_values))

    print(f"\n{'#' * 70}")
    print(f"# Starting hyperparameter tuning with {len(combinations)} configurations")
    print(f"{'#' * 70}\n")

    for i, combination in enumerate(combinations):
        # Create hyperparameter dictionary
        hyperparameters = dict(zip(param_names, combination))

        # Create config name for identification. Two combinations with equal
        # values produce the same name and therefore share one results entry.
        config_name = "_".join([f"{k}={v}" for k, v in hyperparameters.items()])
        print(f"\n{'-' * 70}")
        print(f"Configuration {i + 1}/{len(combinations)}: {config_name}")
        print(f"{'-' * 70}")

        try:
            # Train models with current configuration
            # (s_parameter and seed are not passed: train_frequency_aware_models
            # doesn't accept them)
            models, all_results, all_predictions, scalers = (
                train_frequency_aware_models(
                    X_train,
                    X_test,
                    Y_train,
                    Y_test,
                    hyperparameters=hyperparameters,
                    selected_features=selected_features,
                )
            )

            # Calculate overall score (using average R² as primary metric)
            r2_scores = []
            for model_name, result in all_results.items():
                r2_scores.append(result["avg_metrics"]["r2"])

            avg_r2 = np.mean(r2_scores)

            # Store results
            results[config_name] = {
                "hyperparameters": hyperparameters.copy(),
                "avg_r2": avg_r2,
                "detailed_results": deepcopy(all_results),
                "models": models,
                "scalers": scalers,
            }

            print(f"\nConfiguration {i + 1} results:")
            print(f"  Average R²: {avg_r2:.6f}")

            # Update best configuration (strict '>' keeps the first of tied scores)
            if avg_r2 > best_score:
                best_score = avg_r2
                best_config = hyperparameters.copy()
                print("  ** New best configuration! **")

        except Exception as e:
            # Deliberate best-effort: one failing configuration must not abort
            # the whole grid search; the failure is recorded and reported.
            print(f"Error in configuration {i + 1}: {str(e)}")
            results[config_name] = {
                "hyperparameters": hyperparameters.copy(),
                "error": str(e),
            }

    # Print final summary
    print("\n" + "=" * 80)
    print("HYPERPARAMETER TUNING SUMMARY")
    print("=" * 80)

    # Rank results - only process valid results
    valid_results = {k: v for k, v in results.items() if "error" not in v}

    if valid_results:  # Check if there are any valid results
        ranked_results = sorted(
            valid_results.items(), key=lambda x: x[1]["avg_r2"], reverse=True
        )

        for i, (config_name, result) in enumerate(ranked_results[:5]):  # Show top 5
            print(f"\n{i + 1}. Configuration: {config_name}")
            print(f"   Average R²: {result['avg_r2']:.6f}")
            print("   Hyperparameters:")
            for k, v in result["hyperparameters"].items():
                print(f"     {k}: {v}")

        print(f"\nBest overall configuration achieves R² = {best_score:.6f}")

        # Save detailed results to CSV
        summary_data = []
        for config_name, result in valid_results.items():
            row = {"configuration": config_name, "avg_r2": result["avg_r2"]}
            row.update(result["hyperparameters"])
            summary_data.append(row)

        summary_df = pd.DataFrame(summary_data)
        summary_df = summary_df.sort_values("avg_r2", ascending=False)
        summary_df.to_csv(
            "freq_aware_results/hyperparameter_tuning_results.csv", index=False
        )
    else:
        print("\nNo valid configurations completed successfully!")
        print("All configurations resulted in errors.")

    return results, best_config


def create_architecture_grid():
    """
    Build the default catalogue of hidden-layer layouts to evaluate.

    Returns a fresh dict on every call, mapping a descriptive architecture
    name to its list of hidden layer sizes (insertion order is the order in
    which the layouts are listed below).
    """
    named_layouts = [
        ("symmetric_pyramid_small", [128, 256, 512, 256, 128]),
        ("symmetric_pyramid_medium", [256, 512, 1024, 512, 256]),
        ("symmetric_pyramid_large", [384, 768, 1536, 768, 384]),
        ("progressive_narrow", [64, 128, 256, 512, 1024]),
        ("progressive_wide", [128, 256, 512, 1024, 2048]),
        ("deep_narrow", [128, 256, 512, 512, 512, 256, 128]),
        ("deep_wide", [256, 512, 1024, 1024, 1024, 512, 256]),
        ("wide_shallow", [512, 1024, 512]),
        ("hourglass", [512, 1024, 256, 1024, 512]),
        ("alternating", [256, 512, 256, 512, 256]),
        ("compact", [256, 256, 512, 512]),
        ("simple_deep", [256, 512, 1024, 1024, 512, 256]),
    ]
    return dict(named_layouts)


def test_all_s_parameters_architectures(
    X_train, X_test, Y_train, Y_test, s_parameters_to_test=None, architecture_grid=None
):
    """
    Test different architectures for all specified S-parameters.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Preprocessed feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    s_parameters_to_test : dict or None
        Dictionary of S-parameters to test. If None, tests all.
    architecture_grid : dict or None
        Dictionary of architectures to test. If None, uses default grid.

    Returns:
    --------
    all_results : dict
        Dictionary of results for each S-parameter
    best_configs : dict
        Dictionary of best configurations for each S-parameter
    overall_best : dict
        Overall best configuration across all S-parameters
    """
    # Default S-parameter models
    if s_parameters_to_test is None:
        s_parameters_to_test = {
            "S11": ["S_deemb(1,1)_real", "S_deemb(1,1)_imag"],
            "S12": ["S_deemb(1,2)_real", "S_deemb(1,2)_imag"],
            "S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"],
            "S22": ["S_deemb(2,2)_real", "S_deemb(2,2)_imag"],
        }

    # Default architecture grid
    if architecture_grid is None:
        architecture_grid = create_architecture_grid()

    # Create parameter grid with architectures
    param_grid = {
        "hidden_sizes": list(architecture_grid.values()),
        "dropout_rate": [0.1],
        "learning_rate": [0.002],
        "activation": ["gelu"],
        "lr_scheduler_type": ["reduce_on_plateau"],
        "batch_size": [512],
        "epochs": [300],
        "early_stopping_patience": [40],
    }

    all_results = {}
    best_configs = {}
    best_r2_scores = {}

    # Test each S-parameter individually
    for s_param_name, s_param_components in s_parameters_to_test.items():
        print(f"\n{'=' * 80}")
        print(f"TESTING ARCHITECTURES FOR {s_param_name}")
        print(f"{'=' * 80}")

        # Create single S-parameter configuration
        s_param_single = {s_param_name: s_param_components}

        # Run hyperparameter tuning
        results, best_config = hyperparameter_tuning2(
            X_train,
            X_test,
            Y_train,
            Y_test,
            param_grid=param_grid,
            s_parameter=s_param_single,
            seed=42,
        )

        # Store results
        all_results[s_param_name] = results
        best_configs[s_param_name] = best_config

        # Extract best R² score
        valid_results = {k: v for k, v in results.items() if "error" not in v}
        if valid_results:
            best_r2 = max(r["avg_r2"] for r in valid_results.values())
            best_r2_scores[s_param_name] = best_r2

        # Analyze architecture performance for this S-parameter
        analysis = analyze_architecture_performance(results, s_param_name)

    # Find overall best configuration
    overall_best_r2 = -float("inf")
    overall_best_config = None
    overall_best_s_param = None

    for s_param, r2_score in best_r2_scores.items():
        if r2_score > overall_best_r2:
            overall_best_r2 = r2_score
            overall_best_config = best_configs[s_param]
            overall_best_s_param = s_param

    overall_best = {
        "s_parameter": overall_best_s_param,
        "r2_score": overall_best_r2,
        "config": overall_best_config,
    }

    # Create comprehensive summary
    create_comprehensive_summary(
        all_results, best_configs, best_r2_scores, architecture_grid
    )

    return all_results, best_configs, overall_best


def analyze_architecture_performance(tuning_results, s_param_name):
    """
    Summarise how each tested architecture scored for one S-parameter.

    Builds one row per successful tuning configuration, prints an R² summary
    grouped by architecture type, writes the rows to
    ``freq_aware_results/<s_param_name>_architecture_analysis.csv`` and hands
    them to ``plot_architecture_performance``.

    Returns the per-configuration DataFrame, or an empty DataFrame when there
    is nothing to analyse.
    """
    # Configurations that failed carry an "error" entry and are ignored.
    successful = {
        name: entry for name, entry in tuning_results.items() if "error" not in entry
    }
    if len(successful) == 0:
        print(f"\nNo valid results to analyze for {s_param_name}")
        return pd.DataFrame()  # Return empty DataFrame

    rows = []
    for name, entry in successful.items():
        try:
            score = entry["avg_r2"]
            hyper = entry["hyperparameters"]
            layer_sizes = hyper["hidden_sizes"]
            dropout = hyper["dropout_rate"]
            lr = hyper["learning_rate"]
            act = hyper["activation"]
            rows.append(
                {
                    "configuration": name,
                    "avg_r2": score,
                    "hidden_sizes": layer_sizes,
                    "dropout_rate": dropout,
                    "learning_rate": lr,
                    "activation": act,
                    "num_layers": len(layer_sizes),
                    # Sum of the hidden layer widths (the column name is kept
                    # because the plotting code reads it).
                    "total_params": sum(layer_sizes),
                    "architecture_type": categorize_architecture(layer_sizes),
                }
            )
        except KeyError as e:
            # Incomplete entries are reported and skipped.
            print(f"Warning: Missing key in configuration {name}: {e}")

    if len(rows) == 0:
        print(f"\nNo valid architecture performance data for {s_param_name}")
        return pd.DataFrame()

    # Convert to DataFrame for analysis
    arch_df = pd.DataFrame(rows)

    try:
        # Mean / max / min / count of R² per architecture type
        grouped = arch_df.groupby("architecture_type")
        arch_summary = grouped.agg({"avg_r2": ["mean", "max", "min", "count"]}).round(6)

        print(f"\nArchitecture Type Performance Summary for {s_param_name}:")
        print(arch_summary)
    except KeyError as e:
        print(f"Warning: Unable to group data: {e}")
        print("Empty DataFrame" if arch_df.empty else arch_df.columns)

    # Save detailed analysis
    csv_path = f"freq_aware_results/{s_param_name}_architecture_analysis.csv"
    arch_df.to_csv(csv_path, index=False)

    # Visualize architecture performance if we have data
    if not arch_df.empty:
        plot_architecture_performance(arch_df, s_param_name)

    return arch_df


def categorize_architecture(hidden_sizes):
    """
    Map a list of hidden layer sizes to a coarse architecture label.

    The label depends on the number of layers:
    3 -> "simple"; 5 -> "symmetric_pyramid" when the first and last sizes
    match, otherwise "asymmetric"; 6 -> "progressive" when the sizes never
    decrease, otherwise "deep_mixed"; any other depth ->
    "custom_<depth>_layers".
    """
    depth = len(hidden_sizes)

    if depth == 3:
        return "simple"

    if depth == 5:
        ends_match = hidden_sizes[0] == hidden_sizes[-1]
        return "symmetric_pyramid" if ends_match else "asymmetric"

    if depth == 6:
        # Every layer must be at least as wide as the one before it.
        never_shrinks = all(
            hidden_sizes[k] <= hidden_sizes[k + 1] for k in range(depth - 1)
        )
        return "progressive" if never_shrinks else "deep_mixed"

    return f"custom_{depth}_layers"


def plot_architecture_performance(arch_df, s_param_name):
    """
    Plot architecture performance visualization for a specific S-parameter.

    Produces two PNG files under ``freq_aware_results/`` (both at 300 dpi) and
    closes each figure after saving:

    * ``<s_param_name>_architecture_performance_analysis.png``: a box plot of
      ``avg_r2`` per ``architecture_type`` next to a scatter of
      ``total_params`` vs ``avg_r2`` coloured by ``num_layers``.
    * ``<s_param_name>_hyperparameter_heatmap.png``: a heatmap of mean
      ``avg_r2`` by ``activation`` (rows) and ``dropout_rate`` (columns).

    Parameters
    ----------
    arch_df : pd.DataFrame
        Must provide the columns ``architecture_type``, ``avg_r2``,
        ``total_params``, ``num_layers``, ``activation`` and ``dropout_rate``.
    s_param_name : str
        Used in the plot titles and the output file names.
    """
    # Figure 1: two side-by-side panels on one pyplot figure.
    plt.figure(figsize=(15, 8))

    # Box plot by architecture type
    plt.subplot(1, 2, 1)
    sns.boxplot(data=arch_df, x="architecture_type", y="avg_r2")
    plt.title(f"R² Performance by Architecture Type ({s_param_name})")
    plt.xticks(rotation=45)

    # Scatter plot: Total parameters vs R²
    # NOTE(review): the "total_params" column is read as-is here; confirm with
    # the code that builds arch_df whether it is a true parameter count.
    plt.subplot(1, 2, 2)
    scatter = plt.scatter(
        arch_df["total_params"],
        arch_df["avg_r2"],
        c=arch_df["num_layers"],
        cmap="viridis",
        alpha=0.7,
    )
    plt.colorbar(scatter, label="Number of Layers")
    plt.xlabel("Total Parameters")
    plt.ylabel("Average R²")
    plt.title(f"Model Size vs Performance ({s_param_name})")

    plt.tight_layout()
    plt.savefig(
        f"freq_aware_results/{s_param_name}_architecture_performance_analysis.png",
        dpi=300,
    )
    # Close so the figure is not kept open after saving.
    plt.close()

    # Figure 2: heatmap of hyperparameter combinations
    plt.figure(figsize=(12, 8))
    pivot_data = arch_df.pivot_table(
        values="avg_r2", index="activation", columns="dropout_rate", aggfunc="mean"
    )
    sns.heatmap(pivot_data, annot=True, fmt=".4f", cmap="viridis")
    plt.title(f"Average R² by Activation and Dropout Rate ({s_param_name})")
    plt.tight_layout()
    plt.savefig(
        f"freq_aware_results/{s_param_name}_hyperparameter_heatmap.png", dpi=300
    )
    plt.close()


def create_comprehensive_summary(
    all_results, best_configs, best_r2_scores, architecture_grid
):
    """
    Create a comprehensive summary of all S-parameter tuning results.

    Builds one row per S-parameter in ``all_results`` with its best R² and the
    key hyperparameters of its best configuration, writes the table to
    ``freq_aware_results/all_s_parameters_best_configs.csv``, prints it, and
    calls ``plot_comparative_summary``.

    Parameters
    ----------
    all_results : dict
        Tuning results keyed by S-parameter name (only the keys are used here).
    best_configs : dict
        Best hyperparameter dict per S-parameter. An entry may be missing or
        ``None`` (the tuner returns ``None`` when no configuration succeeded);
        such S-parameters get empty hyperparameter columns.
    best_r2_scores : dict
        Best R² per S-parameter; S-parameters without a score get NaN.
    architecture_grid : dict
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    # Check if we have any valid results
    if not best_r2_scores:
        print("\nNo valid results to summarize!")
        return

    # Create summary DataFrame
    summary_data = []

    for s_param in all_results.keys():
        # `or {}` covers both a missing key and an explicit None entry, which
        # previously raised TypeError on None["hidden_sizes"].
        config = best_configs.get(s_param) or {}
        summary_data.append(
            {
                "S_Parameter": s_param,
                "Best_R2": best_r2_scores.get(s_param, np.nan),
                "Best_Architecture": str(config["hidden_sizes"])
                if "hidden_sizes" in config
                else None,
                "Best_Dropout": config.get("dropout_rate"),
                "Best_LR": config.get("learning_rate"),
                "Best_Activation": config.get("activation"),
                "Best_Scheduler": config.get("lr_scheduler_type"),
            }
        )

    if summary_data:
        summary_df = pd.DataFrame(summary_data)
        summary_df = summary_df.sort_values("Best_R2", ascending=False)
        summary_df.to_csv(
            "freq_aware_results/all_s_parameters_best_configs.csv", index=False
        )

        print("\n" + "=" * 80)
        print("BEST CONFIGURATIONS FOR ALL S-PARAMETERS")
        print("=" * 80)
        print(summary_df.to_string(index=False))

        # Create comparative visualization. Only configurations that actually
        # carry an architecture are forwarded, because the plot indexes
        # config["hidden_sizes"] for every entry it receives.
        plottable_configs = {
            name: cfg
            for name, cfg in best_configs.items()
            if cfg and "hidden_sizes" in cfg
        }
        plot_comparative_summary(summary_df, plottable_configs)
    else:
        print("\nNo summary data to save!")


def plot_comparative_summary(summary_df, best_configs):
    """
    Create comparative visualizations across all S-parameters.

    Saves two PNG files under ``freq_aware_results/`` (300 dpi) and closes
    each figure after saving:

    * ``all_s_parameters_performance_comparison.png``: bar chart of
      ``Best_R2`` per ``S_Parameter`` with the value printed above each bar.
    * ``architecture_distribution.png``: pie chart of how often each
      architecture type (from ``categorize_architecture``) is the best one.

    Parameters
    ----------
    summary_df : pd.DataFrame
        Must provide the columns ``S_Parameter`` and ``Best_R2``.
    best_configs : dict
        Best hyperparameter dict per S-parameter. Entries that are empty or
        ``None`` (no successful configuration) are skipped in the pie chart.
    """
    # Figure 1: Performance comparison
    plt.figure(figsize=(10, 6))
    bars = plt.bar(
        summary_df["S_Parameter"], summary_df["Best_R2"], alpha=0.8, color="skyblue"
    )
    plt.xlabel("S-Parameter")
    plt.ylabel("Best R² Score")
    plt.title("Best Performance for Each S-Parameter")
    plt.ylim(0, 1)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{height:.4f}",
            ha="center",
            va="bottom",
        )

    plt.tight_layout()
    plt.savefig(
        "freq_aware_results/all_s_parameters_performance_comparison.png", dpi=300
    )
    plt.close()

    # Figure 2: Architecture patterns used
    plt.figure(figsize=(12, 8))
    arch_counts = {}
    for s_param, config in best_configs.items():
        if not config:
            # No successful configuration for this S-parameter; indexing a
            # None entry here previously raised TypeError.
            continue
        arch_type = categorize_architecture(config["hidden_sizes"])
        arch_counts[arch_type] = arch_counts.get(arch_type, 0) + 1

    # Materialise the dict views as lists: NumPy cannot turn a dict view into
    # a numeric array, whereas plain lists are always accepted.
    plt.pie(
        list(arch_counts.values()),
        labels=list(arch_counts.keys()),
        autopct="%1.1f%%",
    )
    plt.title("Most Successful Architecture Types Across All S-Parameters")
    plt.tight_layout()
    plt.savefig("freq_aware_results/architecture_distribution.png", dpi=300)
    plt.close()


# Usage example with specific S-parameters
def run_comprehensive_architecture_testing():
    """
    Run the architecture sweep twice using the notebook-level datasets.

    The first sweep covers the default set of S-parameters; the second is
    restricted to S12 and S21. Only the first sweep's
    (all_results, best_configs, overall_best) triple is returned; the second
    sweep is run for its side effects and its return values are discarded.
    """
    # Example 1: Test all S-parameters
    full_sweep = test_all_s_parameters_architectures(
        X_train, X_test, Y_raw_train, Y_raw_test
    )
    all_results, best_configs, overall_best = full_sweep

    # Example 2: Test specific S-parameters
    selected_s_params = {
        name: [f"S_deemb({ports})_real", f"S_deemb({ports})_imag"]
        for name, ports in (("S12", "1,2"), ("S21", "2,1"))
    }
    _subset_results, _subset_configs, _subset_best = (
        test_all_s_parameters_architectures(
            X_train,
            X_test,
            Y_raw_train,
            Y_raw_test,
            s_parameters_to_test=selected_s_params,
        )
    )

    return all_results, best_configs, overall_best


# Custom architecture testing
def run_custom_architectures():
    """
    Run the architecture sweep with a hand-picked set of layer layouts.

    Uses the notebook-level datasets and returns the
    (all_results, best_configs, overall_best) triple produced by
    ``test_all_s_parameters_architectures``.
    """
    # Define custom architectures
    custom_architectures = dict(
        ultra_deep=[128 for _ in range(8)],  # 8 layers of same size
        funnel=[1024, 512, 256, 128, 64, 32],  # Decreasing
        reverse_funnel=[32, 64, 128, 256, 512, 1024],  # Increasing
        v_shape=[512, 256, 128, 256, 512],  # V pattern
        diamond=[256, 512, 1024, 2048, 1024, 512, 256],  # Wide middle
    )

    sweep = test_all_s_parameters_architectures(
        X_train, X_test, Y_raw_train, Y_raw_test, architecture_grid=custom_architectures
    )
    all_results, best_configs, overall_best = sweep

    return all_results, best_configs, overall_best


In [None]:
# Test only S21 (the dict below contains no S12 entry)
selected_s_params = {"S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"]}

# Define custom architectures
# NOTE(review): several entries share identical layer lists
# (progressive/reverse_funnel, reverse_progressive/funnel,
# sym_pyramid/diamond, v_shape/hourglass, alternating/zigzag).
# test_all_s_parameters_architectures forwards only the values, and
# hyperparameter_tuning2 keys its results by the hyperparameter values, so each
# duplicate is trained again and overwrites the same results entry.
custom_architectures = {
    # Basic/Simple
    "simple": [512, 512, 512],
    # Progressive (Increasing)
    "progressive": [128, 256, 512, 1024],
    # Reverse Progressive (Decreasing)
    "reverse_progressive": [1024, 512, 256, 128],
    # Symmetric Pyramid
    "sym_pyramid": [256, 512, 1024, 512, 256],
    # Asymmetric Pyramid
    "asym_pyramid": [256, 512, 1024, 512, 256, 128],
    # Deep Mixed
    "deep_mixed": [256, 512, 256, 1024, 512, 256],
    # V-Shape
    "v_shape": [512, 256, 128, 256, 512],
    # Inverted V-Shape
    "inv_v_shape": [128, 256, 512, 256, 128],
    # Hourglass
    "hourglass": [512, 256, 128, 256, 512],
    # Alternating
    "alternating": [256, 512, 256, 512, 256],
    # Compact
    "compact": [512, 1024, 512],
    # Ultra-Deep
    "ultra_deep": [256, 256, 256, 256, 256, 256],
    # Wide Shallow
    "wide_shallow": [1024, 2048, 1024],
    # Funnel
    "funnel": [1024, 512, 256, 128],
    # Reverse Funnel
    "reverse_funnel": [128, 256, 512, 1024],
    # Diamond
    "diamond": [256, 512, 1024, 512, 256],
    # Step Pattern
    "step": [256, 256, 512, 512, 1024],
    # Zigzag
    "zigzag": [256, 512, 256, 512, 256],
    # Exponential Growth
    "exponential": [128, 256, 512, 1024, 2048],
}

# Run the architecture sweep for the selected S-parameter(s) with the grid
# above; results are bound to notebook-level names for later inspection.
results, best_configs, overall_best = test_all_s_parameters_architectures(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    s_parameters_to_test=selected_s_params,
    architecture_grid=custom_architectures,
)

In [None]:
# Test only S21 (the dict below contains no S12 entry)
selected_s_params = {"S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"]}

# Define custom architectures: a second sweep made of wider variants.
# NOTE(review): "extra_wide" and "compact_wide_1" are the same layer list
# ([2048, 4096, 2048]); only the values are forwarded to the tuner, so this
# layout is trained twice and both runs land in the same results entry.
custom_architectures = {
    # Best performer and variations
    "wide_best": [1024, 2048, 1024],
    "ultra_wide": [1024, 4096, 1024],
    "extra_wide": [2048, 4096, 2048],
    # Explore intermediate widths
    "wide_medium_1": [768, 1536, 768],
    "wide_medium_2": [1536, 3072, 1536],
    # Slight asymmetry variations of best
    "asym_wide_1": [512, 2048, 1024],
    "asym_wide_2": [1024, 3072, 2048],
    "asym_wide_3": [2048, 4096, 1024],
    # Four-layer wide patterns
    "wide_4layer_1": [512, 1024, 2048, 1024],
    "wide_4layer_2": [1024, 2048, 2048, 1024],
    "wide_4layer_3": [512, 2048, 2048, 512],
    # Five-layer variations maintaining width
    "wide_5layer_1": [512, 1024, 2048, 1024, 512],
    "wide_5layer_2": [1024, 1024, 2048, 1024, 1024],
    # Alternative wide patterns
    "wide_alt_1": [1024, 1536, 1024],
    "wide_alt_2": [2048, 3072, 2048],
    "wide_alt_3": [1024, 2048, 1536, 1024],
    # Exploration beyond widths - depth variations
    "deep_wide_1": [1024, 1024, 1024, 1024],
    "deep_wide_2": [1536, 1536, 1536],
    "deep_wide_3": [2048, 2048, 2048],
    # Compact wide variations
    "compact_wide_1": [2048, 4096, 2048],
    "compact_wide_2": [3072, 6144, 3072],
}

# Run the sweep; this rebinds results/best_configs/overall_best from the
# previous cell.
results, best_configs, overall_best = test_all_s_parameters_architectures(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    s_parameters_to_test=selected_s_params,
    architecture_grid=custom_architectures,
)

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn

# Create the output directory that the saving code in this notebook writes to
# (models, scalers, CSVs, figures); exist_ok=True makes re-running this cell a no-op.
os.makedirs("freq_aware_results", exist_ok=True)


# Define SMAPE function for better handling of small values
def symmetric_mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return the symmetric MAPE in percent.

    Each absolute error is divided by the mean of |y_true| and |y_pred| plus
    ``epsilon`` (which keeps the division defined when both are zero), and the
    ratios are averaged.
    """
    abs_error = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2.0 + epsilon
    return np.mean(abs_error / scale) * 100


# Define mean absolute percentage error function
def mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return the MAPE in percent over entries whose |y_true| exceeds ``epsilon``.

    Entries with (near-)zero targets are excluded; if no entry qualifies the
    result is NaN.
    """
    usable = np.abs(y_true) > epsilon
    if not usable.any():
        return np.nan

    actual = y_true[usable]
    predicted = y_pred[usable]
    relative_error = np.abs((actual - predicted) / (np.abs(actual) + epsilon))
    return np.mean(relative_error * 100)


# Frequency-aware neural network
class FrequencyAwareNetwork(nn.Module):
    """Two-branch MLP that processes frequency features separately from the rest.

    The frequency columns and the remaining columns of the input are each sent
    through their own stack built from the first two entries of
    ``hidden_sizes``. The two branch outputs are concatenated and passed
    through a combined stack built from the remaining entries, ending in a
    linear layer with two outputs (real and imaginary component).

    Every hidden layer is ``Linear -> activation -> BatchNorm1d -> Dropout``.

    Parameters
    ----------
    freq_features : int
        Number of input columns routed to the frequency branch.
    other_features : int
        Number of input columns routed to the other branch.
    hidden_sizes : sequence of int
        Layer widths; at least two entries are required (both are used by each
        branch). Entries from the third onwards size the combined stack.
    dropout_rate : float
        Dropout probability used after every hidden layer.
    activation : str
        One of ``"silu"``, ``"relu"`` or ``"gelu"``.

    Raises
    ------
    ValueError
        If ``activation`` is not supported or ``hidden_sizes`` has fewer than
        two entries.

    Notes
    -----
    ``set_feature_indices`` must be called before ``forward``. The indices are
    plain Python attributes, not parameters or buffers, so they are not part
    of ``state_dict()`` and must be set again after loading saved weights.
    """

    def __init__(
        self,
        freq_features,
        other_features,
        hidden_sizes=(64, 128, 256),
        dropout_rate=0.2,
        activation="silu",
    ):
        super().__init__()

        if activation == "silu":
            activation_fn = nn.SiLU()
        elif activation == "relu":
            activation_fn = nn.ReLU()
        elif activation == "gelu":
            activation_fn = nn.GELU()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        # Work on a private list so the caller's sequence (or the default) is
        # never shared, and fail early with a clear message instead of the
        # IndexError that hidden_sizes[1] would raise further down.
        hidden_sizes = list(hidden_sizes)
        if len(hidden_sizes) < 2:
            raise ValueError(
                "hidden_sizes must contain at least two layer sizes "
                f"(got {len(hidden_sizes)})"
            )
        branch_sizes = hidden_sizes[:2]  # First two hidden sizes for branches

        # Frequency-specific processing branch
        self.freq_branch = nn.Sequential(
            *self._hidden_stack(freq_features, branch_sizes, activation_fn, dropout_rate)
        )

        # Other parameters branch
        self.other_branch = nn.Sequential(
            *self._hidden_stack(
                other_features, branch_sizes, activation_fn, dropout_rate
            )
        )

        # Combined processing (plain feed-forward stack on the concatenation)
        combined_input = hidden_sizes[1] * 2  # Output size from both branches combined
        combined_sizes = hidden_sizes[2:]
        combined_layers = self._hidden_stack(
            combined_input, combined_sizes, activation_fn, dropout_rate
        )
        last_width = combined_sizes[-1] if combined_sizes else combined_input

        # Final output layer for real and imaginary components
        combined_layers.append(nn.Linear(last_width, 2))

        self.combined = nn.Sequential(*combined_layers)

        # Store feature indices for processing
        self.freq_indices = None
        self.other_indices = None

    @staticmethod
    def _hidden_stack(in_features, sizes, activation_fn, dropout_rate):
        """Return the layer list Linear/activation/BatchNorm1d/Dropout per size."""
        layers = []
        prev_size = in_features
        for h_size in sizes:
            layers.append(nn.Linear(prev_size, h_size))
            # The activation module holds no parameters, so one instance is
            # shared by every layer.
            layers.append(activation_fn)
            layers.append(nn.BatchNorm1d(h_size))
            layers.append(nn.Dropout(dropout_rate))
            prev_size = h_size
        return layers

    def forward(self, x):
        """Run both branches on their column subsets and return the 2-column output."""
        # Split input into frequency and other features
        if self.freq_indices is None or self.other_indices is None:
            raise ValueError(
                "Feature indices not set. Call set_feature_indices() first."
            )

        freq_input = x[:, self.freq_indices]
        other_input = x[:, self.other_indices]

        # Process through branches
        freq_features = self.freq_branch(freq_input)
        other_features = self.other_branch(other_input)

        # Combine and output
        combined = torch.cat([freq_features, other_features], dim=1)
        return self.combined(combined)

    def set_feature_indices(self, freq_indices, other_indices):
        """Set indices for frequency and other features."""
        self.freq_indices = freq_indices
        self.other_indices = other_indices


# Helper function to identify frequency-related features
def identify_frequency_features(X_columns):
    """Split column positions into frequency-related and all other columns.

    A column counts as frequency-related when its lower-cased name contains
    ``"freq"`` or ``"band"``.

    Returns:
        (freq_features, other_features): two lists of integer positions into
        ``X_columns``, each in ascending order.
    """
    freq_features, other_features = [], []

    for position, column_name in enumerate(X_columns):
        lowered = column_name.lower()
        if "freq" in lowered or "band" in lowered:
            freq_features.append(position)
        else:
            other_features.append(position)

    print(
        f"Identified {len(freq_features)} frequency-related features and {len(other_features)} other features"
    )
    return freq_features, other_features


# Modified prepare_data_for_pytorch to handle scaling
def prepare_data_for_pytorch_with_scaling(
    X_train, Y_train, X_test, Y_test, components, batch_size=128, scale_y=True
):
    """Prepare tensors and a training DataLoader, optionally standardizing Y.

    Args:
        X_train, X_test: feature DataFrames; all columns are used.
        Y_train, Y_test: target DataFrames; only ``components`` columns are used.
        components: list of target column names to predict.
        batch_size: mini-batch size of the returned training loader.
        scale_y: when True, fit a ``StandardScaler`` on the training targets,
            apply it to both target sets, and persist it to
            ``freq_aware_results/<components joined by "_">_scaler.pkl``.

    Returns:
        ``(X_train_tensor, Y_train_tensor, X_test_tensor, Y_test_tensor,
        train_loader, y_scaler)``. All tensors are float32. ``y_scaler`` is the
        fitted scaler when ``scale_y`` is True and ``None`` otherwise. The
        loader shuffles the training set.
    """
    # torch.tensor(..., dtype=float32) always copies and accepts any numeric
    # dtype, unlike the legacy torch.FloatTensor(ndarray) constructor.
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

    Y_train_values = Y_train[components].values
    Y_test_values = Y_test[components].values

    y_scaler = None
    if scale_y:
        # Imported locally: the notebook's top import cell does not bring
        # these names into scope, and `os` is only needed on this path.
        import os

        import joblib
        from sklearn.preprocessing import StandardScaler

        # Fit on the training targets only, then apply to both splits.
        y_scaler = StandardScaler()
        Y_train_values = y_scaler.fit_transform(Y_train_values)
        Y_test_values = y_scaler.transform(Y_test_values)

        # Save scaler for later use; make sure the output directory exists so
        # a fresh kernel does not fail here.
        os.makedirs("freq_aware_results", exist_ok=True)
        component_str = "_".join(components)
        joblib.dump(y_scaler, f"freq_aware_results/{component_str}_scaler.pkl")

    Y_train_tensor = torch.tensor(Y_train_values, dtype=torch.float32)
    Y_test_tensor = torch.tensor(Y_test_values, dtype=torch.float32)

    # Only the training split gets a (shuffled) loader; the test tensors are
    # returned whole.
    train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    return (
        X_train_tensor,
        Y_train_tensor,
        X_test_tensor,
        Y_test_tensor,
        train_loader,
        y_scaler,
    )


def train_model(
    model,
    train_loader,
    X_test_tensor,
    Y_test_tensor,
    criterion,
    optimizer,
    device,
    epochs=100,
    early_stopping_patience=15,
    verbose=True,
    lr_scheduler_type="reduce_on_plateau",
    warmup_epochs=5,
):
    """Train a PyTorch model with early stopping and learning rate scheduling.

    Args:
        model: module to train; moved to ``device``.
        train_loader: iterable of ``(inputs, targets)`` mini-batches.
        X_test_tensor, Y_test_tensor: tensors used to compute the per-epoch
            validation loss that drives early stopping and ReduceLROnPlateau.
            NOTE(review): callers in this notebook pass the test split here, so
            the reported test metrics are not independent of model selection.
        criterion: loss function called as ``criterion(outputs, targets)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: torch device for training and validation.
        epochs: maximum number of epochs.
        early_stopping_patience: stop after this many consecutive epochs
            without a new best validation loss.
        verbose: print progress every 10 epochs, LR reductions and the early
            stopping epoch.
        lr_scheduler_type: ``"reduce_on_plateau"``, ``"cosine_annealing"``,
            ``"one_cycle"``; any other value disables scheduling.
        warmup_epochs: linear LR warm-up length; only applied when no
            scheduler is active.

    Returns:
        ``(model, train_losses, val_losses)`` where ``model`` holds the
        weights from the epoch with the lowest validation loss.

    Side effects:
        Saves the learning-rate curve to
        ``freq_aware_results/learning_rate_schedule.png``.
    """
    import os

    model = model.to(device)

    # Set up learning rate scheduler based on specified type
    if lr_scheduler_type == "reduce_on_plateau":
        # `verbose` is deliberately not forwarded: the scheduler argument is
        # deprecated in PyTorch; LR reductions are reported manually below.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.85, patience=5, min_lr=5e-7
        )
    elif lr_scheduler_type == "cosine_annealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=epochs, eta_min=1e-6
        )
    elif lr_scheduler_type == "one_cycle":
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=optimizer.param_groups[0]["lr"],
            steps_per_epoch=len(train_loader),
            epochs=epochs,
        )
    else:
        scheduler = None

    # Target learning rates for warm-up. Scaling from these (rather than from
    # the current, already scaled value) keeps the ramp linear and ends
    # exactly at the configured rate, per parameter group.
    base_lrs = [param_group["lr"] for param_group in optimizer.param_groups]

    # Validation data never changes, so move it to the device once.
    val_inputs = X_test_tensor.to(device)
    val_targets = Y_test_tensor.to(device)

    # For early stopping
    best_loss = float("inf")
    best_model_state = None
    patience_counter = 0

    # Track losses and learning rates for plotting
    train_losses = []
    val_losses = []
    learning_rates = []

    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Linear warm-up, only when no scheduler manages the learning rate
        if warmup_epochs > 0 and epoch < warmup_epochs and scheduler is None:
            lr_multiplier = (epoch + 1) / warmup_epochs
            for param_group, base_lr in zip(optimizer.param_groups, base_lrs):
                param_group["lr"] = base_lr * lr_multiplier

        # Record current learning rate
        current_lr = optimizer.param_groups[0]["lr"]
        learning_rates.append(current_lr)

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # OneCycleLR is stepped once per batch, not once per epoch
            if lr_scheduler_type == "one_cycle":
                scheduler.step()

            running_loss += loss.item()

        # Calculate average training loss
        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation loss
        model.eval()
        with torch.no_grad():
            val_outputs = model(val_inputs)
            val_loss = criterion(val_outputs, val_targets).item()
            val_losses.append(val_loss)

        # Print progress
        if verbose and (epoch + 1) % 10 == 0:
            print(
                f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {val_loss:.6f}, LR: {current_lr:.8f}"
            )

        # Learning rate scheduler step (except for OneCycleLR which is done per iteration)
        if scheduler is not None:
            if lr_scheduler_type == "reduce_on_plateau":
                scheduler.step(val_loss)
                new_lr = optimizer.param_groups[0]["lr"]
                if verbose and new_lr < current_lr:
                    print(
                        f"Epoch {epoch + 1}: reducing learning rate to {new_lr:.4e}"
                    )
            elif lr_scheduler_type == "cosine_annealing":
                scheduler.step()

        # Check for early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() tensors alias the live parameters, and dict.copy()
            # is shallow; clone each tensor so the snapshot is not overwritten
            # by later optimizer steps.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                if verbose:
                    print(f"Early stopping at epoch {epoch + 1}")
                break

    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Plot learning rate schedule
    os.makedirs("freq_aware_results", exist_ok=True)
    plt.figure(figsize=(10, 4))
    plt.plot(learning_rates)
    plt.xlabel("Epochs")
    plt.ylabel("Learning Rate")
    plt.title("Learning Rate Schedule")
    plt.yscale("log")
    plt.savefig("freq_aware_results/learning_rate_schedule.png")
    plt.close()

    return model, train_losses, val_losses


# Modified evaluate_model function to handle scaling
def evaluate_model_with_scaling(
    model, X_test_tensor, Y_test_tensor, Y_test, components, device, y_scaler=None
):
    """Evaluate a trained model and calculate performance metrics.

    Args:
        model: trained module; switched to eval mode.
        X_test_tensor: test features fed to the model.
        Y_test_tensor: unused; kept so existing callers keep working.
        Y_test: DataFrame holding the unscaled targets; ``components`` columns
            are compared against the predictions column by column.
        components: target column names, in the model's output order.
        device: device on which the forward pass runs.
        y_scaler: optional fitted scaler; when given, predictions are mapped
            back with ``inverse_transform`` before scoring.

    Returns:
        ``(metrics, avg_metrics, predictions_original)``:
        ``metrics`` maps each component to ``mse``/``rmse``/``r2``/``mae`` plus
        ``smape`` (components whose name contains ``"S12"`` or
        ``"S_deemb(1,2)"``) or ``mape`` (all others). ``avg_metrics`` holds the
        mean ``rmse``/``r2``/``mae`` over all components, and the mean
        ``smape`` and/or ``mape`` over the components that report them.
        ``predictions_original`` is the prediction array in original units.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor.to(device)).cpu().numpy()

    # Inverse transform if scaler was used; targets are always read unscaled.
    if y_scaler is not None:
        predictions_original = y_scaler.inverse_transform(predictions)
    else:
        predictions_original = predictions
    y_test_original = Y_test[components].values

    # Calculate metrics
    metrics = {}

    for i, component in enumerate(components):
        y_true = y_test_original[:, i]
        y_pred = predictions_original[:, i]

        mse = mean_squared_error(y_true, y_pred)
        component_metrics = {
            "mse": mse,
            "rmse": np.sqrt(mse),
            "r2": r2_score(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
        }

        # Use SMAPE instead of MAPE for S12
        if "S12" in component or "S_deemb(1,2)" in component:
            component_metrics["smape"] = symmetric_mean_absolute_percentage_error(
                y_true, y_pred
            )
        else:
            # Regular MAPE for other S-parameters
            component_metrics["mape"] = mean_absolute_percentage_error(y_true, y_pred)

        metrics[component] = component_metrics

    # Calculate average metrics
    avg_metrics = {
        key: np.mean([metrics[comp][key] for comp in components])
        for key in ("rmse", "r2", "mae")
    }

    # Average each percentage metric only over the components that report it,
    # so a mix of S12 and non-S12 components cannot raise KeyError.
    for key in ("smape", "mape"):
        reported = [metrics[comp][key] for comp in components if key in metrics[comp]]
        if reported:
            avg_metrics[key] = np.mean(reported)

    return metrics, avg_metrics, predictions_original


def plot_learning_curves(train_losses, val_losses, model_name):
    """Save a training-vs-validation loss plot for one model.

    The figure is written to
    ``freq_aware_results/learning_curves_<model_name>.png`` and then closed.
    """
    curves = (
        (train_losses, "Training Loss"),
        (val_losses, "Validation Loss"),
    )

    plt.figure(figsize=(10, 6))
    for loss_history, legend_label in curves:
        plt.plot(loss_history, label=legend_label)

    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(f"Learning Curves for {model_name}")
    plt.legend()

    output_path = f"freq_aware_results/learning_curves_{model_name}.png"
    plt.savefig(output_path)
    plt.close()


def plot_predictions(Y_test, predictions, components, model_name):
    """Save predicted-vs-actual scatter plots, one panel per component.

    Column ``i`` of ``predictions`` is plotted against
    ``Y_test[components[i]]``; a dashed red line spanning the actual-value
    range marks perfect agreement. The figure is written to
    ``freq_aware_results/predictions_<model_name>.png`` and then closed.
    """
    panel_count = len(components)
    fig, axes = plt.subplots(1, panel_count, figsize=(15, 5))
    # With a single panel, plt.subplots returns a bare Axes, not a sequence.
    panels = axes if panel_count > 1 else [axes]

    for column, (panel, component) in enumerate(zip(panels, components)):
        actual = Y_test[component].values
        predicted = predictions[:, column]

        panel.scatter(actual, predicted, alpha=0.3)

        diagonal = [min(actual), max(actual)]
        panel.plot(diagonal, list(diagonal), "r--")

        panel.set_xlabel("Actual")
        panel.set_ylabel("Predicted")
        panel.set_title(f"{component}")

    plt.tight_layout()
    plt.savefig(f"freq_aware_results/predictions_{model_name}.png")
    plt.close()


def plot_error_distribution(Y_test, predictions, components, model_name):
    """Save histograms of prediction error, one panel per component.

    The error for component ``i`` is ``predictions[:, i]`` minus
    ``Y_test[components[i]]``. Each histogram is drawn with a KDE overlay. The
    figure is written to ``freq_aware_results/error_dist_<model_name>.png``
    and then closed.
    """
    panel_count = len(components)
    fig, axes = plt.subplots(1, panel_count, figsize=(15, 5))
    # With a single panel, plt.subplots returns a bare Axes, not a sequence.
    panels = axes if panel_count > 1 else [axes]

    for column, (panel, component) in enumerate(zip(panels, components)):
        residuals = predictions[:, column] - Y_test[component].values

        sns.histplot(residuals, kde=True, ax=panel)
        panel.set_xlabel("Prediction Error")
        panel.set_ylabel("Frequency")
        panel.set_title(f"{component} Error Distribution")

    plt.tight_layout()
    plt.savefig(f"freq_aware_results/error_dist_{model_name}.png")
    plt.close()


# Modified train_frequency_aware_models function
def train_frequency_aware_models(
    X_train,
    X_test,
    Y_train,
    Y_test,
    hyperparameters=None,
    selected_features=None,
    s_parameters=None,
    seed=None,
):
    """
    Train frequency-aware models for each S-parameter with conditional scaling.

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Feature datasets.
    Y_train, Y_test : pd.DataFrame
        Target datasets containing the component columns of every model.
    hyperparameters : dict, optional
        Overrides for the built-in defaults (``hidden_sizes``, ``dropout_rate``,
        ``learning_rate``, ``batch_size``, ``epochs``,
        ``early_stopping_patience``, ``activation``, ``lr_scheduler_type``).
        Keys that are not supplied keep their default value, so partial
        dictionaries are accepted.
    selected_features : list of str, optional
        If given, only these feature columns are used.
    s_parameters : dict, optional
        Maps a model name to its list of target columns, e.g.
        ``{"S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"]}``. Defaults to
        the S22 real/imaginary pair. Target scaling is applied only to a
        model named ``"S12"``.
    seed : int, optional
        If given, Python's, NumPy's and PyTorch's random generators are
        re-seeded with this value before each model is built and trained.

    Returns:
    --------
    models, all_results, all_predictions, scalers : dict
        Each keyed by model name; ``scalers`` only has entries for models
        whose targets were scaled.

    Side effects: writes learning curves, prediction plots, error
    distributions and ``<model_name>_model.pth`` into ``freq_aware_results/``.
    """
    import os
    import random

    # S-parameter definitions
    if s_parameters is None:
        s_parameter_models = {
            "S22": ["S_deemb(2,2)_real", "S_deemb(2,2)_imag"],
        }
    else:
        s_parameter_models = dict(s_parameters)

    # Merge caller overrides onto the defaults so a partial dictionary (e.g.
    # one produced from a tuning grid) does not fail with KeyError.
    default_hyperparameters = {
        "hidden_sizes": [64, 128, 256],
        "dropout_rate": 0.2,
        "learning_rate": 0.001,
        "batch_size": 256,
        "epochs": 150,
        "early_stopping_patience": 15,
        "activation": "gelu",
        "lr_scheduler_type": "one_cycle",
    }
    hyperparameters = {**default_hyperparameters, **(hyperparameters or {})}

    # Every artifact below is written here; create it up front so a fresh
    # kernel does not fail after a long training run.
    os.makedirs("freq_aware_results", exist_ok=True)

    # Filter features if requested
    if selected_features is not None:
        X_train = X_train[selected_features]
        X_test = X_test[selected_features]
        print(f"Using {len(selected_features)} selected features")

    # Check for GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Identify frequency-related features
    freq_indices, other_indices = identify_frequency_features(X_train.columns)

    # Store results and models
    models = {}
    all_results = {}
    all_predictions = {}
    scalers = {}  # Store scalers for each model

    # Record start time
    start_time = time.time()

    # Train a model for each S-parameter
    for model_name, components in s_parameter_models.items():
        print(f"\n{'=' * 50}")
        print(f"Training frequency-aware model for {model_name}")
        print(f"{'=' * 50}")

        # Re-seed per model so each model's initial weights and batch order do
        # not depend on which other models were trained before it.
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        # Decide whether to scale Y data (only for S12)
        scale_y = model_name == "S12"

        # Prepare data with conditional scaling; the last element is the
        # fitted scaler, or None when scale_y is False.
        (
            X_train_tensor,
            Y_train_tensor,
            X_test_tensor,
            Y_test_tensor,
            train_loader,
            y_scaler,
        ) = prepare_data_for_pytorch_with_scaling(
            X_train,
            Y_train,
            X_test,
            Y_test,
            components,
            hyperparameters["batch_size"],
            scale_y=scale_y,
        )

        if scale_y:
            scalers[model_name] = y_scaler
            print("Applied StandardScaler to Y values for S12")

        # Initialize model
        model = FrequencyAwareNetwork(
            len(freq_indices),
            len(other_indices),
            hyperparameters["hidden_sizes"],
            hyperparameters["dropout_rate"],
            hyperparameters["activation"],
        )
        model.set_feature_indices(freq_indices, other_indices)

        # Loss and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=hyperparameters["learning_rate"])

        # Train model
        trained_model, train_losses, val_losses = train_model(
            model,
            train_loader,
            X_test_tensor,
            Y_test_tensor,
            criterion,
            optimizer,
            device,
            hyperparameters["epochs"],
            hyperparameters["early_stopping_patience"],
            lr_scheduler_type=hyperparameters["lr_scheduler_type"],
        )

        # Plot learning curves
        plot_learning_curves(train_losses, val_losses, model_name)

        # Evaluate model with proper scaling handling
        metrics, avg_metrics, predictions = evaluate_model_with_scaling(
            trained_model,
            X_test_tensor,
            Y_test_tensor,
            Y_test,
            components,
            device,
            scalers.get(model_name),
        )

        # Plot predictions and error distributions
        plot_predictions(Y_test, predictions, components, model_name)
        plot_error_distribution(Y_test, predictions, components, model_name)

        # Print results
        print(f"\nPerformance metrics for {model_name}:")
        for component, metric in metrics.items():
            print(f"  {component}:")
            print(f"    RMSE: {metric['rmse']:.6f}")
            print(f"    R²: {metric['r2']:.6f}")
            print(f"    MAE: {metric['mae']:.6f}")
            if "smape" in metric:
                print(f"    SMAPE: {metric['smape']:.2f}%")
            else:
                print(f"    MAPE: {metric['mape']:.2f}%")

        print(f"\nAverage metrics for {model_name}:")
        print(f"  R²: {avg_metrics['r2']:.6f}")
        print(f"  RMSE: {avg_metrics['rmse']:.6f}")
        print(f"  MAE: {avg_metrics['mae']:.6f}")
        if "smape" in avg_metrics:
            print(f"  SMAPE: {avg_metrics['smape']:.2f}%")
        else:
            print(f"  MAPE: {avg_metrics['mape']:.2f}%")

        # Store results
        models[model_name] = trained_model
        all_results[model_name] = {
            "component_metrics": metrics,
            "avg_metrics": avg_metrics,
        }
        all_predictions[model_name] = predictions

    # Record total training time
    train_time = time.time() - start_time
    print(f"\nTotal training time: {train_time:.2f} seconds")

    # Save models
    for model_name, model in models.items():
        torch.save(model.state_dict(), f"freq_aware_results/{model_name}_model.pth")

    print("Models and results saved to freq_aware_results/")

    return models, all_results, all_predictions, scalers


# Function to experiment with different hyperparameters
def hyperparameter_tuning(X_train, X_test, Y_train, Y_test, param_grid):
    """
    Perform hyperparameter tuning by training models with different configurations.

    Every combination of the values in ``param_grid`` (Cartesian product, with
    the first key varying slowest) is passed as ``hyperparameters`` to
    ``train_frequency_aware_models``. Configurations are ranked by the mean of
    the per-model average R².

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Preprocessed feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    param_grid : dict
        Dictionary mapping each hyperparameter name to the list of values to
        try. Whether hyperparameters missing from the grid fall back to
        defaults is decided by ``train_frequency_aware_models``.

    Returns:
    --------
    results : dict
        Maps each configuration name (``"key=value"`` pairs joined by ``"_"``)
        to a dict with ``params``, ``avg_r2`` and ``detailed_results``.
    """
    from itertools import product

    results = {}

    param_keys = list(param_grid.keys())

    for combination in product(*param_grid.values()):
        # A fresh dict per configuration: nothing the training function does
        # to it can leak into the next configuration.
        current_params = dict(zip(param_keys, combination))

        config_name = "_".join([f"{k}={v}" for k, v in current_params.items()])
        print(f"\n\n{'#' * 70}")
        print(f"# Testing configuration: {config_name}")
        print(f"{'#' * 70}\n")

        # Train models. The training function returns
        # (models, all_results, all_predictions, scalers); take the results by
        # position instead of unpacking a fixed number of values.
        training_outputs = train_frequency_aware_models(
            X_train, X_test, Y_train, Y_test, hyperparameters=current_params
        )
        all_results = training_outputs[1]

        # Store results
        avg_r2 = np.mean(
            [result["avg_metrics"]["r2"] for result in all_results.values()]
        )
        results[config_name] = {
            "params": current_params.copy(),
            "avg_r2": avg_r2,
            "detailed_results": all_results,
        }

    # Rank results
    ranked_results = sorted(results.items(), key=lambda x: x[1]["avg_r2"], reverse=True)

    # Print summary
    print("\n\n" + "=" * 80)
    print("HYPERPARAMETER TUNING RESULTS")
    print("=" * 80)

    for i, (config_name, result) in enumerate(ranked_results):
        print(f"\n{i + 1}. Configuration: {config_name}")
        print(f"   Average R²: {result['avg_r2']:.6f}")
        print(f"   Parameters: {result['params']}")

    return results


# Function to test different feature subsets
def feature_selection_experiment(X_train, X_test, Y_train, Y_test, feature_sets):
    """
    Test different feature subsets to find optimal combinations.

    Each feature set is passed as ``selected_features`` to
    ``train_frequency_aware_models``; sets are ranked by the mean of the
    per-model average R².

    Parameters:
    -----------
    X_train, X_test : pd.DataFrame
        Complete feature datasets
    Y_train, Y_test : pd.DataFrame
        Target S-parameter datasets
    feature_sets : dict
        Dictionary mapping set names to lists of feature columns

    Returns:
    --------
    results : dict
        Maps each set name to a dict with ``features``, ``feature_count``,
        ``avg_r2`` and ``detailed_results``.
    """
    results = {}

    for set_name, features in feature_sets.items():
        print(f"\n\n{'#' * 70}")
        print(f"# Testing feature set: {set_name} ({len(features)} features)")
        print(f"{'#' * 70}\n")

        # Train models with this feature set. The training function returns
        # (models, all_results, all_predictions, scalers); take the results by
        # position instead of unpacking a fixed number of values.
        training_outputs = train_frequency_aware_models(
            X_train, X_test, Y_train, Y_test, selected_features=features
        )
        all_results = training_outputs[1]

        # Store results
        avg_r2 = np.mean(
            [result["avg_metrics"]["r2"] for result in all_results.values()]
        )
        results[set_name] = {
            "features": features,
            "feature_count": len(features),
            "avg_r2": avg_r2,
            "detailed_results": all_results,
        }

    # Rank results
    ranked_results = sorted(results.items(), key=lambda x: x[1]["avg_r2"], reverse=True)

    # Print summary
    print("\n\n" + "=" * 80)
    print("FEATURE SELECTION RESULTS")
    print("=" * 80)

    for i, (set_name, result) in enumerate(ranked_results):
        print(f"\n{i + 1}. Feature Set: {set_name}")
        print(f"   Features: {len(result['features'])}")
        print(f"   Average R²: {result['avg_r2']:.6f}")

    return results


# Example usage


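# Example of running with all features and explicitly supplied hyperparameters
# (train_frequency_aware_models returns four values: models, results,
# predictions and the fitted target scalers)
# models, results, predictions, scalers = train_frequency_aware_models(
#     X_train, X_test, Y_raw_train, Y_raw_test,
#     hyperparameters=default_hyperparameters
# )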

# Example of hyperparameter tuning
# param_grid = {
#     'learning_rate': [0.0001, 0.001, 0.01],
#     'dropout_rate': [0.1, 0.2, 0.3],
#     'batch_size': [128, 256, 512]
# }
# tuning_results = hyperparameter_tuning(X_train, X_test, Y_raw_train, Y_raw_test, param_grid)

# Example of feature selection experiment
# core_features = ['freq', 'vb', 'vc', 'gm_abs_log']
# freq_features = [col for col in X_train.columns if 'freq' in col]
# impedance_features = [col for col in X_train.columns if 'Zin' in col or 'Zout' in col]

# feature_sets = {
#     'all_features': X_train.columns.tolist(),
#     'frequency_only': freq_features,
#     'core_plus_frequency': core_features + freq_features,
#     'core_plus_impedance': core_features + impedance_features,
#     'optimized_set': ['freq', 'freq_log', 'freq_log_norm', 'vb', 'vc', 'gm_abs_log',
#                       'Zin_real_log', 'Zin_imag_log', 'Zout_real_log']
# }
# feature_results = feature_selection_experiment(X_train, X_test, Y_raw_train, Y_raw_test, feature_sets)

In [None]:
# Architecture sweep restricted to S22 (real and imaginary parts).
selected_s_params = {"S22": ["S_deemb(2,2)_real", "S_deemb(2,2)_imag"]}

# Define custom architectures: name -> list of hidden layer sizes.
# NOTE: several names below map to identical size lists (marked inline), so
# those architectures are evaluated twice under different labels.
custom_architectures = {
    # Basic/Simple
    "simple": [512, 512, 512],
    # Progressive (Increasing)
    "progressive": [128, 256, 512, 1024],
    # Reverse Progressive (Decreasing)
    "reverse_progressive": [1024, 512, 256, 128],
    # Symmetric Pyramid
    "sym_pyramid": [256, 512, 1024, 512, 256],
    # Asymmetric Pyramid
    "asym_pyramid": [256, 512, 1024, 512, 256, 128],
    # Deep Mixed
    "deep_mixed": [256, 512, 256, 1024, 512, 256],
    # V-Shape
    "v_shape": [512, 256, 128, 256, 512],
    # Inverted V-Shape
    "inv_v_shape": [128, 256, 512, 256, 128],
    # Hourglass (same sizes as "v_shape")
    "hourglass": [512, 256, 128, 256, 512],
    # Alternating
    "alternating": [256, 512, 256, 512, 256],
    # Compact
    "compact": [512, 1024, 512],
    # Ultra-Deep
    "ultra_deep": [256, 256, 256, 256, 256, 256],
    # Wide Shallow
    "wide_shallow": [1024, 2048, 1024],
    # Funnel (same sizes as "reverse_progressive")
    "funnel": [1024, 512, 256, 128],
    # Reverse Funnel (same sizes as "progressive")
    "reverse_funnel": [128, 256, 512, 1024],
    # Diamond (same sizes as "sym_pyramid")
    "diamond": [256, 512, 1024, 512, 256],
    # Step Pattern
    "step": [256, 256, 512, 512, 1024],
    # Zigzag (same sizes as "alternating")
    "zigzag": [256, 512, 256, 512, 256],
    # Exponential Growth
    "exponential": [128, 256, 512, 1024, 2048],
}

# NOTE(review): test_all_s_parameters_architectures is defined outside this
# section; its return values are bound to notebook-level names that later
# cells rebind.
results, best_configs, overall_best = test_all_s_parameters_architectures(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    s_parameters_to_test=selected_s_params,
    architecture_grid=custom_architectures,
)

In [None]:
# Refined architecture sweep, again restricted to S22 (real and imaginary parts).
selected_s_params = {"S22": ["S_deemb(2,2)_real", "S_deemb(2,2)_imag"]}

# Define custom architectures: name -> list of hidden layer sizes.
# The group headings record why each family was included; the performance
# remarks refer to an earlier sweep whose output is not shown here.
custom_architectures = {
    # Based on the ultra-deep pattern (best MAE)
    "ultra_deep_256": [256, 256, 256, 256, 256, 256],
    "ultra_deep_8": [256, 256, 256, 256, 256, 256, 256, 256],
    "ultra_deep_128": [128, 128, 128, 128, 128, 128, 128, 128],
    "ultra_deep_512": [512, 512, 512, 512, 512, 512],
    # Based on the compact narrow-wide-narrow pattern (strong performers)
    "compact_512": [512, 1024, 512],
    "compact_768": [768, 1536, 768],
    "compact_1024": [1024, 2048, 1024],
    "compact_256": [256, 512, 256],
    # Based on alternating pattern (good performance)
    "alternating_v1": [256, 512, 256, 512, 256],
    "alternating_v2": [128, 256, 128, 256, 128],
    "alternating_deep": [256, 512, 256, 512, 256, 512, 256],
    "alternating_wide": [512, 1024, 512, 1024, 512],
    # Progressive increases (successful in top 5)
    "progressive_fine": [128, 192, 256, 320, 384, 448, 512],
    "progressive_quad": [256, 512, 1024, 2048],
    "progressive_exp": [64, 128, 256, 512, 1024],
    # Decreasing pattern variations (good performance)
    "decrease_fine": [1024, 896, 768, 640, 512, 384, 256],
    "decrease_quad": [2048, 1024, 512, 256],
    "decrease_exp": [1024, 512, 256, 128, 64],
    # Hybrid patterns (combining best elements)
    "hybrid_1": [256, 512, 1024, 512, 256],  # Like Config 4
    "hybrid_2": [512, 1024, 2048, 1024, 512],
    "hybrid_3": [128, 256, 512, 1024, 512, 256, 128],
    # Bottleneck variations (the name suffix is the narrowest layer width)
    "bottleneck_32": [256, 128, 64, 32, 64, 128, 256],
    "bottleneck_16": [512, 256, 128, 64, 32, 16, 32, 64, 128, 256, 512],
    "bottleneck_64": [512, 256, 128, 64, 128, 256, 512],
    # Plateau patterns (inspired by top performers)
    "plateau_256": [128, 256, 256, 256, 256, 128],
    "plateau_512": [256, 512, 512, 512, 512, 256],
    "plateau_1024": [512, 1024, 1024, 1024, 1024, 512],
    # Fine-grained ultra-deep
    "ultra_deep_fine_1": [192, 192, 192, 192, 192, 192, 192, 192],
    "ultra_deep_fine_2": [384, 384, 384, 384, 384, 384],
    "ultra_deep_fine_3": [448, 448, 448, 448, 448, 448],
    # Similar to best performers but with slight modifications
    "modified_config_12": [256, 256, 256, 256, 256, 256, 256],
    "modified_config_13": [1024, 1536, 2048, 1536, 1024],
    "modified_config_18": [256, 512, 256, 512, 256, 512],
}

# Rebinds results / best_configs / overall_best, replacing the values from
# any earlier sweep cell.
results, best_configs, overall_best = test_all_s_parameters_architectures(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    s_parameters_to_test=selected_s_params,
    architecture_grid=custom_architectures,
)

In [None]:
# Define custom architectures: name -> list of hidden layer sizes.
custom_architectures = {
    "custom_1": [512, 1024, 2048, 1024, 512],
    "custom_2": [128, 256, 512, 1024, 512, 256],
    "custom_3": [384, 768, 384],
}

# No s_parameters_to_test is passed here, so the set of S-parameters is
# whatever test_all_s_parameters_architectures uses by default (defined
# outside this section — TODO confirm). Rebinds results / best_configs /
# overall_best.
results, best_configs, overall_best = test_all_s_parameters_architectures(
    X_train, X_test, Y_raw_train, Y_raw_test, architecture_grid=custom_architectures
)

In [None]:
# Hyperparameters for the S21 run below (384-768-1536-768-384 hidden sizes).
best_hyperparameters = {
    "hidden_sizes": [384, 768, 1536, 768, 384],
    "dropout_rate": 0.1,
    "learning_rate": 0.002,
    "batch_size": 512,
    "epochs": 300,
    "early_stopping_patience": 40,
    "activation": "gelu",
    "lr_scheduler_type": "reduce_on_plateau",
}

# Configure which S-parameters to train: model name -> target columns.
s_parameters_to_train = {
    "S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"],
}

# Train the selected model. Rebinds models / results / predictions / scalers.
# NOTE(review): this call passes `s_parameters` and `seed`; confirm that the
# train_frequency_aware_models definition in scope accepts both keywords.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    hyperparameters=best_hyperparameters,
    s_parameters=s_parameters_to_train,
    seed=42,  # fixed seed, intended to make the run repeatable
)

In [None]:
# Hyperparameters for the S12 run below (384-768-1536-768-384 hidden sizes).
best_hyperparameters = {
    "hidden_sizes": [384, 768, 1536, 768, 384],
    "dropout_rate": 0.1,
    "learning_rate": 0.002,
    "batch_size": 512,
    "epochs": 300,
    "early_stopping_patience": 40,
    "activation": "gelu",
    "lr_scheduler_type": "reduce_on_plateau",
}

# Configure which S-parameters to train: model name -> target columns.
s_parameters_to_train = {
    "S12": ["S_deemb(1,2)_real", "S_deemb(1,2)_imag"],
}

# Train the selected model. Rebinds models / results / predictions / scalers,
# replacing the values left by the previous training cell.
# NOTE(review): this call passes `s_parameters` and `seed`; confirm that the
# train_frequency_aware_models definition in scope accepts both keywords.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    hyperparameters=best_hyperparameters,
    s_parameters=s_parameters_to_train,
    seed=42,  # fixed seed, intended to make the run repeatable
)

In [None]:
# Hyperparameters for the S22 run below (256-512-1024-512-256 hidden sizes).
best_hyperparameters = {
    "hidden_sizes": [256, 512, 1024, 512, 256],
    "dropout_rate": 0.1,
    "learning_rate": 0.002,
    "batch_size": 512,
    "epochs": 300,
    "early_stopping_patience": 40,
    "activation": "gelu",
    "lr_scheduler_type": "reduce_on_plateau",
}

# Configure which S-parameters to train: model name -> target columns.
s_parameters_to_train = {
    "S22": ["S_deemb(2,2)_real", "S_deemb(2,2)_imag"],
}

# Train the selected model. Rebinds models / results / predictions / scalers,
# replacing the values left by the previous training cell.
# NOTE(review): this call passes `s_parameters` and `seed`; confirm that the
# train_frequency_aware_models definition in scope accepts both keywords.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    hyperparameters=best_hyperparameters,
    s_parameters=s_parameters_to_train,
    seed=42,  # fixed seed, intended to make the run repeatable
)

In [None]:
# Hyperparameters for a second S21 run (256-512-1024-512-256 hidden sizes);
# an earlier cell trains S21 with 384-768-1536-768-384 instead.
best_hyperparameters = {
    "hidden_sizes": [256, 512, 1024, 512, 256],
    "dropout_rate": 0.1,
    "learning_rate": 0.002,
    "batch_size": 512,
    "epochs": 300,
    "early_stopping_patience": 40,
    "activation": "gelu",
    "lr_scheduler_type": "reduce_on_plateau",
}

# Configure which S-parameters to train: model name -> target columns.
s_parameters_to_train = {
    "S21": ["S_deemb(2,1)_real", "S_deemb(2,1)_imag"],
}

# Train the selected model. Rebinds models / results / predictions / scalers,
# replacing the values left by the previous training cell.
# NOTE(review): this call passes `s_parameters` and `seed`; confirm that the
# train_frequency_aware_models definition in scope accepts both keywords.
models, results, predictions, scalers = train_frequency_aware_models(
    X_train,
    X_test,
    Y_raw_train,
    Y_raw_test,
    hyperparameters=best_hyperparameters,
    s_parameters=s_parameters_to_train,
    seed=42,  # fixed seed, intended to make the run repeatable
)