In [1]:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import os
    from datetime import (datetime)
    import json
    import multiprocessing
    #import sys
    import itertools
    import time
    
    import numpy as np

    import concurrent.futures
    from sklearn.model_selection import KFold
    from permutation_weighting.estimator import PW
    
    # Ensure the examples directory is in the path
    #sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    
    # Import from permutation_weighting
    from permutation_weighting.estimator import PW
    
    # Import utilities from examples directory
    from data_utils import generate_kang_schafer_data, evaluate_ate_binary, evaluate_dose_response_continuous
    from plots import generate_model_performance_plots, improved_final_model_evaluation, create_timestamped_directory
    
    # Create output directory
    output_dir = create_timestamped_directory(base_dir="tuning_results", prefix="hyperparam")
    print(f"Results will be saved to: {output_dir}")
    

Results will be saved to: tuning_results/hyperparam_20250318_145409


In [2]:


# Define evaluate_params as a standalone function outside any other function
def evaluate_params(params_tuple, X, A, df, cv_splits, method, n_replications, 
                   use_sgd, use_torch, true_ate_value, continuous_treatment, 
                   method_name, param_names, evaluate_ate_binary, evaluate_dose_response_continuous):
    """
    Evaluate a single parameter combination across cross-validation folds
    """
    idx, values = params_tuple
    params = dict(zip(param_names, values))
    
    # Storage for fold results
    fold_results = []
    
    # Run cross-validation
    for fold, (train_idx, test_idx) in enumerate(cv_splits):
        X_train, X_test = X[train_idx], X[test_idx]
        A_train, A_test = A[train_idx], A[test_idx]
        
        # Create train dataframe for evaluation
        df_train = pd.DataFrame({
            **{f'X{i}': X_train[:, i] for i in range(X_train.shape[1])},
            'A': A_train
        })
        
        for col in df.columns:
            if col.startswith('Y') or col == 'true_dr':
                df_train[col] = df.loc[train_idx, col].values
        
        try:
            # Determine batch_size if using SGD or torch
            batch_size = None
            if 'batch_size' in params:
                batch_size = params['batch_size']
            elif use_sgd or use_torch:
                # Use a default batch size appropriate for the dataset size
                batch_size = min(128, max(32, X_train.shape[0] // 10))
            
            # Train PW model
            pw_result = PW(
                A=A_train, 
                X=X_train, 
                classifier=method, 
                classifier_params=params,
                estimand='ATE',
                num_replicates=n_replications,
                use_sgd=use_sgd,
                use_torch=use_torch,
                batch_size=batch_size
            )
            
            # Save in-sample metrics
            in_sample_mse = pw_result['train'].get('MSEEvaluator', np.nan)
            in_sample_logloss = pw_result['train'].get('LogLossEvaluator', np.nan)
            
            # Apply to test set
            eval_data = {'A': A_test, 'X': X_test}
            eval_pw_result = PW(
                A=A_train, 
                X=X_train, 
                classifier=method, 
                classifier_params=params,
                estimand='ATE',
                num_replicates=n_replications,
                eval_data=eval_data,
                use_sgd=use_sgd,
                use_torch=use_torch,
                batch_size=batch_size
            )
            
            # Get out-of-sample metrics
            out_sample_mse = eval_pw_result['eval'].get('MSEEvaluator', np.nan)
            out_sample_logloss = eval_pw_result['eval'].get('LogLossEvaluator', np.nan)
            
            # Calculate error metric
            if continuous_treatment:
                # For continuous treatment
                dr_eval = evaluate_dose_response_continuous(df_train, pw_result['weights'])
                target_error = dr_eval['integrated_bias']
            else:
                # For binary treatment
                est_ate = evaluate_ate_binary(df_train, pw_result['weights'], true_ate_value)
                target_error = abs(est_ate)
            
            # Save result
            fold_results.append({
                'fold': fold,
                'in_sample_mse': in_sample_mse,
                'in_sample_logloss': in_sample_logloss,
                'out_sample_mse': out_sample_mse,
                'out_sample_logloss': out_sample_logloss,
                'target_error': target_error,
                'converged': pw_result.get('convergence_info', {}).get('converged', True)
            })
            
        except Exception as e:
            # Silently continue on error
            pass
    
    # Skip if no valid results
    if not fold_results:
        return None
    
    # Average results across folds
    mean_results = {
        k: np.mean([r[k] for r in fold_results if k in r and not np.isnan(r[k])]) 
        for k in ['in_sample_mse', 'in_sample_logloss', 'out_sample_mse', 
                  'out_sample_logloss', 'target_error']
    }
    
    # Calculate standard errors
    std_results = {
        f"{k}_std": np.std([r[k] for r in fold_results if k in r and not np.isnan(r[k])]) / np.sqrt(len(fold_results))
        for k in ['in_sample_mse', 'in_sample_logloss', 'out_sample_mse', 
                 'out_sample_logloss', 'target_error']
    }
    
    # Save parameters
    result = {**params, **mean_results, **std_results, 'method': method_name, 'params_idx': idx}
    
    # Convert any lists or tuples in params to strings for safe storage
    for k, v in params.items():
        if isinstance(v, (list, tuple)):
            result[k] = str(v)
    
    return result



In [3]:
# Modified cross_validate_hyperparams function that correctly passes parameters to evaluate_params
def cross_validate_hyperparams(df, method, param_grid, method_name, n_folds=5, 
                             use_sgd=False, use_torch=False, 
                             n_replications=1, seed=42, verbose=True,
                             max_workers=8, true_ate_value=None):
    """
    Perform cross-validation to tune hyperparameters with parallel processing
    
    Parameters:
    -----------
    df: pd.DataFrame
        Data frame with columns: X1-X4, A, Y, Y1, Y0 (for binary) or true_dr (for continuous)
    method: str
        Classification method ('logit', 'boosting', 'neural_net', etc.)
    param_grid: dict
        Dictionary mapping parameter names to lists of values to try
    method_name: str
        Name to identify this method in results
    n_folds: int
        Number of cross-validation folds
    use_sgd: bool
        Whether to use SGD-based training
    use_torch: bool
        Whether to use PyTorch-based training
    n_replications: int
        Number of permutation replications for PW
    seed: int
        Random seed for reproducibility
    verbose: bool
        Whether to print progress information
    max_workers: int
        Maximum number of parallel workers
    true_ate_value: float
        True ATE value for evaluation (if None, will be calculated from data)
        
    Returns:
    --------
    results: pd.DataFrame
        Data frame with results of cross-validation
    """
    from data_utils import evaluate_ate_binary, evaluate_dose_response_continuous
    
    # Extract features and target
    feature_cols = [col for col in df.columns if col.startswith('X') and not col.endswith('_mis')]
    X = df[feature_cols].values
    A = df['A'].values
    
    # Determine if treatment is continuous
    continuous_treatment = 'true_dr' in df.columns
    
    # Set up cross-validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_splits = list(kf.split(X))
    
    # Set true target value for binary treatment
    if not continuous_treatment:
        if true_ate_value is None:
            # Calculate from potential outcomes if available
            if 'Y1' in df.columns and 'Y0' in df.columns:
                true_ate_value = np.mean(df['Y1'] - df['Y0'])
            else:
                true_ate_value = 1.0  # Default value for Kang-Schafer simulation
        
        if verbose:
            print(f"Using True ATE: {true_ate_value:.4f}")
    else:
        true_ate_value = None
    
    # Generate all parameter combinations
    param_names = list(param_grid.keys())
    param_values = list(itertools.product(*param_grid.values()))
    
    # Progress tracking
    total_combos = len(param_values)
    if verbose:
        print(f"Running {total_combos} parameter combinations for {method_name} using up to {max_workers} workers...")
    
    # Track time
    start_time = time.time()
    
    # Wrapper function to make it easier to call evaluate_params with all the needed parameters
    def evaluate_params_wrapper(params_tuple):
        return evaluate_params(
            params_tuple=params_tuple,
            X=X,
            A=A,
            df=df,
            cv_splits=cv_splits,
            method=method,
            n_replications=n_replications,
            use_sgd=use_sgd,
            use_torch=use_torch,
            true_ate_value=true_ate_value,
            continuous_treatment=continuous_treatment,
            method_name=method_name,
            param_names=param_names,
            evaluate_ate_binary=evaluate_ate_binary,
            evaluate_dose_response_continuous=evaluate_dose_response_continuous
        )
    
    # Run evaluations in parallel
    results = []
    
    # Use ThreadPoolExecutor for macOS
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map the parameters directly
        futures = [executor.submit(evaluate_params_wrapper, (i, values)) 
                  for i, values in enumerate(param_values)]
        
        # Process results as they complete
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            if verbose and i % max(1, total_combos // 10) == 0:
                elapsed = time.time() - start_time
                print(f"Processed {i+1}/{total_combos} combinations - Elapsed: {elapsed:.1f}s")
            
            try:
                result = future.result()
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Error processing result: {e}")
    
    # Create DataFrame from results
    df_results = pd.DataFrame([r for r in results if r is not None])
    
    # Sort by target error
    if not df_results.empty:
        df_results = df_results.sort_values('target_error')
    
    # Print best parameters
    if verbose and not df_results.empty:
        best_idx = df_results['target_error'].idxmin()
        best_params = {k: df_results.loc[best_idx, k] for k in param_names}
        print(f"\nBest parameters for {method_name}:")
        for k, v in best_params.items():
            print(f"  {k}: {v}")
        print(f"Target Error: {df_results.loc[best_idx, 'target_error']:.4f}")
        print(f"Out-of-sample Log Loss: {df_results.loc[best_idx, 'out_sample_logloss']:.4f}")
    
    return df_results

In [4]:
# Main function for hyperparameter tuning notebook
def main_hyperparameter_tuning():
    """
    Main function to run hyperparameter tuning for permutation weighting methods
    """

    # Generate Kang-Schafer data for tuning
    print("Generating Kang-Schafer data...")
    n_samples = 2000  # Adjust based on your needs
    n_folds=5 # Adjust based on your needs
    df_binary = generate_kang_schafer_data(n=n_samples, seed=42, misspecified=False, continuous_treatment=False)
    print(f"Generated binary treatment data with {n_samples} observations")
    
    # Calculate true ATE manually if not available in data_utils
    def calculate_true_ate(df):
        """Calculate true ATE from potential outcomes"""
        if 'Y1' in df.columns and 'Y0' in df.columns:
            return np.mean(df['Y1'] - df['Y0'])
        return 1.0  # Default value for Kang-Schafer simulation
    
    true_ate_value = calculate_true_ate(df_binary)
    print(f"True ATE: {true_ate_value:.4f}")
    
    # Define parameter grids
    # Traditional methods
    logit_params = {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l1', 'l2'],
        'solver': ['saga', 'liblinear']
    }
    
    boosting_params = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [2, 3, 4]
    }
    
    # SGD-based methods with batch size for internal mini-batch optimization
    sgd_logit_params = {
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'optimal', 'adaptive'],
        'eta0': [0.001, 0.01, 0.1],
        'batch_size': [32, 64, 128]  # Used for internal mini-batch optimization
    }
    
    # neural_net_params = {
    #     'hidden_layer_sizes': [(32,), (64,), (32, 16)],
    #     'activation': ['relu', 'tanh'],
    #     'alpha': [0.0001, 0.001],
    #     'learning_rate_init': [0.001, 0.01],
    #     'batch_size': [32, 64, 128]  # Used for internal mini-batch optimization
    # }
    
    # Reduced neural net parameters
    neural_net_params = {
        'hidden_layer_sizes': [(32,), (64,)],  # Reduced from 3 to 2 options
        'activation': ['relu'],  # Reduced from 2 to 1 option 
        'alpha': [0.001],  # Reduced from 2 to 1 option
        'learning_rate_init': [0.01],  # Reduced from 2 to 1 option
        'batch_size': [32, 64] ,
        'max_iter': [50],
        'early_stopping': [True],
        'n_iter_no_change': [5]# Reduced from 3 to 2 options
    }
    
    # PyTorch-based methods (if available)
    torch_available = False
    try:
        import torch
        torch_available = True
        
        torch_logistic_params = {
            'learning_rate': [0.01, 0.1], #0.001,
            'l2_reg': [ 0.001, 0.01], #0.0001,
            'batch_size': [ 64, 128], #32,
            'max_iter': [100],# Used for internal mini-batch optimization
        }
        
        torch_mlp_params = {
            'hidden_dims': [[32],[32, 16]], #[64],
            'learning_rate': [0.001, 0.01],
            'l2_reg': [0.0001, 0.001],
            'batch_size': [64, 128], #32, 
            'max_iter': [100],# Used for internal mini-batch optimization
        }
        
        print("PyTorch is available. Will include PyTorch models.")
    except ImportError:
        print("PyTorch not available, skipping PyTorch-based models")
    
  

    
    # Import necessary modules
    import itertools
    import time
    
    # Determine max workers based on system - use fewer than total cores for stability
    max_workers = max(1, min(8, multiprocessing.cpu_count() - 1))
    print(f"Using {max_workers} parallel workers for hyperparameter tuning")
    
    # Run hyperparameter tuning for binary treatment
    print("\n=== Binary Treatment Hyperparameter Tuning ===")
    
    binary_results = {}
    
    # Standard logistic regression
    print("\nTuning Standard Logistic Regression...")
    binary_results['Standard Logistic'] = cross_validate_hyperparams(
        df_binary, 'logit', logit_params, 'Standard Logistic',
        n_folds=n_folds, max_workers=max_workers, true_ate_value=true_ate_value
    )
    
    # Gradient boosting
    print("\nTuning Gradient Boosting...")
    binary_results['Gradient Boosting'] = cross_validate_hyperparams(
        df_binary, 'boosting', boosting_params, 'Gradient Boosting',
        n_folds=n_folds, max_workers=max_workers, true_ate_value=true_ate_value
    )
    
    # SGD logistic regression
    print("\nTuning SGD Logistic Regression...")
    binary_results['SGD Logistic'] = cross_validate_hyperparams(
        df_binary, 'logit', sgd_logit_params, 'SGD Logistic',
        n_folds=n_folds, use_sgd=True, max_workers=max_workers, true_ate_value=true_ate_value
    )
    
    # Neural network
    print("\nTuning Neural Network...")
    binary_results['Neural Network'] = cross_validate_hyperparams(
        df_binary, 'neural_net', neural_net_params, 'Neural Network',
        n_folds=n_folds, use_sgd=True, max_workers=max_workers, true_ate_value=true_ate_value
    )
    
    # PyTorch models (if available)
    if torch_available:
        print("\nTuning PyTorch Logistic Regression...")
        binary_results['PyTorch Logistic'] = cross_validate_hyperparams(
            df_binary, 'logistic', torch_logistic_params, 'PyTorch Logistic',
            n_folds=n_folds, use_torch=True, max_workers=max_workers, true_ate_value=true_ate_value
        )
        
        print("\nTuning PyTorch MLP...")
        binary_results['PyTorch MLP'] = cross_validate_hyperparams(
            df_binary, 'mlp', torch_mlp_params, 'PyTorch MLP',
            n_folds=n_folds, use_torch=True, max_workers=max_workers, true_ate_value=true_ate_value
        )
    
    # Generate combined performance plots
    # Add this to your main_hyperparameter_tuning function before calling generate_model_performance_plots
    print("\nGenerating performance plots...")
    
    # Rename the column to match what generate_model_performance_plots expects
    for method in binary_results:
        if not binary_results[method].empty and 'target_error' in binary_results[method].columns:
            # Rename 'target_error' to 'ate_error'
            binary_results[method] = binary_results[method].rename(columns={
                'target_error': 'ate_error', 
                'target_error_std': 'ate_error_std'
        })

    try:
        best_configs = generate_model_performance_plots(binary_results, output_dir)
        print(f"Successfully generated performance plots in {output_dir}")
    except Exception as e:
        print(f"Error generating performance plots: {str(e)}")
        # Create a simplified version of the best configs manually
        best_configs = {}
        for method_name, df in binary_results.items():
            if not df.empty:
                error_col = 'ate_error' if 'ate_error' in df.columns else 'target_error'
                if error_col in df.columns:
                    best_idx = df[error_col].idxmin()
                    param_cols = [col for col in df.columns if col not in 
                                 ['method', 'ate_error', 'target_error', 'in_sample_mse', 'in_sample_logloss', 
                                  'out_sample_mse', 'out_sample_logloss', 
                                  'in_sample_mse_std', 'in_sample_logloss_std',
                                  'out_sample_mse_std', 'out_sample_logloss_std', 
                                  'ate_error_std', 'target_error_std']]
                    
                    best_configs[method_name] = {
                        'method': method_name,
                        'params': {col: df.loc[best_idx, col] for col in param_cols},
                        'ate_error': df.loc[best_idx, error_col],
                        'in_sample_logloss': df.loc[best_idx, 'in_sample_logloss'] if 'in_sample_logloss' in df else None,
                        'out_sample_logloss': df.loc[best_idx, 'out_sample_logloss'] if 'out_sample_logloss' in df else None
                    }
    
    # Run final model evaluation with ROC curves
    print("\nRunning final model evaluation...")
    methods_to_eval = []
    for method_name, best_config in best_configs.items():
        # Extract parameters for this method
        params = {k: v for k, v in best_config.items() 
                 if k not in ['ate_error', 'in_sample_logloss', 'out_sample_logloss', 'target_error', 'method']}
        
        # Determine which base method to use
        use_sgd = False
        use_torch = False
        
        if method_name == 'SGD Logistic' or method_name == 'Neural Network':
            use_sgd = True
            base_method = 'logit' if method_name == 'SGD Logistic' else 'neural_net'
        elif method_name == 'PyTorch Logistic' or method_name == 'PyTorch MLP':
            use_torch = True
            base_method = 'logistic' if method_name == 'PyTorch Logistic' else 'mlp'
        elif method_name == 'Gradient Boosting':
            base_method = 'boosting'
        else:  # Standard Logistic
            base_method = 'logit'
        
        methods_to_eval.append((
            method_name,
            {'method': base_method, 'params': params, 'use_sgd': use_sgd, 'use_torch': use_torch}
        ))
    
    # Generate ROC curves
    improved_final_model_evaluation(df_binary, methods_to_eval, output_dir=output_dir)
    
    print(f"\nHyperparameter tuning complete. Results saved to '{output_dir}' directory.")

if __name__ == "__main__":
    main_hyperparameter_tuning()

Generating Kang-Schafer data...
Generated binary treatment data with 2000 observations
True ATE: 0.9484
PyTorch is available. Will include PyTorch models.
Using 8 parallel workers for hyperparameter tuning

=== Binary Treatment Hyperparameter Tuning ===

Tuning Standard Logistic Regression...
Using True ATE: 0.9484
Running 12 parameter combinations for Standard Logistic using up to 8 workers...
Processed 1/12 combinations - Elapsed: 1.9s
Processed 2/12 combinations - Elapsed: 2.0s
Processed 3/12 combinations - Elapsed: 2.0s
Processed 4/12 combinations - Elapsed: 2.0s
Processed 5/12 combinations - Elapsed: 2.2s
Processed 6/12 combinations - Elapsed: 2.2s
Processed 7/12 combinations - Elapsed: 2.4s
Processed 8/12 combinations - Elapsed: 2.5s
Processed 9/12 combinations - Elapsed: 3.0s
Processed 10/12 combinations - Elapsed: 3.1s
Processed 11/12 combinations - Elapsed: 3.1s
Processed 12/12 combinations - Elapsed: 3.1s

Best parameters for Standard Logistic:
  C: 10.0
  penalty: l2
  solve

INFO: Starting ROC curve evaluation for different models
INFO: Processed methods for evaluation:
INFO:   Standard Logistic: logit - {'C': np.float64(10.0), 'penalty': 'l2', 'solver': 'saga', 'max_iter': np.float64(nan), 'random_state': 42}
INFO:   Gradient Boosting: boosting - {'n_estimators': np.float64(200.0), 'learning_rate': np.float64(0.1), 'max_depth': np.float64(2.0), 'random_state': 42}
INFO:   Neural Network: neural_net - {'hidden_layer_sizes': (32,), 'activation': 'relu', 'alpha': np.float64(0.001), 'learning_rate_init': np.float64(0.01), 'batch_size': np.float64(32.0), 'max_iter': np.float64(50.0), 'early_stopping': True, 'n_iter_no_change': np.float64(5.0), 'random_state': 42}
INFO:   PyTorch Logistic: logistic - {'learning_rate': np.float64(0.01), 'batch_size': np.float64(64.0), 'l2_reg': np.float64(0.001), 'hidden_dims': nan, 'random_state': 42}
INFO:   PyTorch MLP: mlp - {'learning_rate': np.float64(0.001), 'batch_size': np.float64(64.0), 'l2_reg': np.float64(0.0001), 'h


Best configuration for each method:
Standard Logistic:
  ATE Error: 2.0644
  In-sample Log Loss: 0.7262
  Out-of-sample Log Loss: 0.7226
  Parameters:
    C: 10.0
    penalty: l2
    solver: saga
    params_idx: 10
    n_estimators: nan
    learning_rate: nan
    max_depth: nan
    hidden_layer_sizes: nan
    activation: nan
    alpha: nan
    learning_rate_init: nan
    batch_size: nan
    max_iter: nan
    early_stopping: nan
    n_iter_no_change: nan
    l2_reg: nan
    hidden_dims: nan

Gradient Boosting:
  ATE Error: 4.5851
  In-sample Log Loss: 0.7105
  Out-of-sample Log Loss: 0.7374
  Parameters:
    C: nan
    penalty: nan
    solver: nan
    params_idx: 15
    n_estimators: 200.0
    learning_rate: 0.1
    max_depth: 2.0
    hidden_layer_sizes: nan
    activation: nan
    alpha: nan
    learning_rate_init: nan
    batch_size: nan
    max_iter: nan
    early_stopping: nan
    n_iter_no_change: nan
    l2_reg: nan
    hidden_dims: nan

Neural Network:
  ATE Error: 16.2980
  In-

  plt.legend(loc="lower right", fontsize=12)
INFO: ROC curves saved to tuning_results/hyperparam_20250318_145409/roc_curves.png



Hyperparameter tuning complete. Results saved to 'tuning_results/hyperparam_20250318_145409' directory.


In [25]:
# Function to evaluate one parameter combination
def evaluate_params(params_tuple):
    idx, values = params_tuple
    params = dict(zip(param_names, values))
    
    # Storage for fold results
    fold_results = []
    
    # Run cross-validation
    for fold, (train_idx, test_idx) in enumerate(cv_splits):
        X_train, X_test = X[train_idx], X[test_idx]
        A_train, A_test = A[train_idx], A[test_idx]
        
        # Create train dataframe for evaluation
        df_train = pd.DataFrame({
            **{f'X{i}': X_train[:, i] for i in range(X_train.shape[1])},
            'A': A_train
        })
        
        for col in df.columns:
            if col.startswith('Y') or col == 'true_dr':
                df_train[col] = df.loc[train_idx, col].values
        
        try:
            # Determine batch_size if using SGD or torch
            batch_size = None
            if 'batch_size' in params:
                batch_size = params['batch_size']
            elif use_sgd or use_torch:
                # Use a default batch size appropriate for the dataset size
                batch_size = min(128, max(32, X_train.shape[0] // 10))
            
            # Train PW model
            pw_result = PW(
                A=A_train, 
                X=X_train, 
                classifier=method, 
                classifier_params=params,
                estimand='ATE',
                num_replicates=n_replications,
                use_sgd=use_sgd,
                use_torch=use_torch,
                batch_size=batch_size
            )
            
            # Save in-sample metrics
            in_sample_mse = pw_result['train'].get('MSEEvaluator', np.nan)
            in_sample_logloss = pw_result['train'].get('LogLossEvaluator', np.nan)
            
            # Apply to test set
            eval_data = {'A': A_test, 'X': X_test}
            eval_pw_result = PW(
                A=A_train, 
                X=X_train, 
                classifier=method, 
                classifier_params=params,
                estimand='ATE',
                num_replicates=n_replications,
                eval_data=eval_data,
                use_sgd=use_sgd,
                use_torch=use_torch,
                batch_size=batch_size
            )
            
            # Get out-of-sample metrics
            out_sample_mse = eval_pw_result['eval'].get('MSEEvaluator', np.nan)
            out_sample_logloss = eval_pw_result['eval'].get('LogLossEvaluator', np.nan)
            
            # Calculate error metric
            if continuous_treatment:
                # For continuous treatment
                dr_eval = evaluate_dose_response_continuous(df_train, pw_result['weights'])
                target_error = dr_eval['integrated_bias']
            else:
                # For binary treatment
                est_ate = evaluate_ate_binary(df_train, pw_result['weights'], true_ate_value)
                target_error = abs(est_ate)
            
            # Save result
            fold_results.append({
                'fold': fold,
                'in_sample_mse': in_sample_mse,
                'in_sample_logloss': in_sample_logloss,
                'out_sample_mse': out_sample_mse,
                'out_sample_logloss': out_sample_logloss,
                'target_error': target_error,
                'converged': pw_result.get('convergence_info', {}).get('converged', True)
            })
            
        except Exception as e:
            # Silently continue on error
            pass
    
    # Skip if no valid results
    if not fold_results:
        return None
    
    # Average results across folds
    mean_results = {
        k: np.mean([r[k] for r in fold_results if k in r and not np.isnan(r[k])]) 
        for k in ['in_sample_mse', 'in_sample_logloss', 'out_sample_mse', 
                  'out_sample_logloss', 'target_error']
    }
    
    # Calculate standard errors
    std_results = {
        f"{k}_std": np.std([r[k] for r in fold_results if k in r and not np.isnan(r[k])]) / np.sqrt(len(fold_results))
        for k in ['in_sample_mse', 'in_sample_logloss', 'out_sample_mse', 
                 'out_sample_logloss', 'target_error']
    }
    
    # Save parameters
    result = {**params, **mean_results, **std_results, 'method': method_name, 'params_idx': idx}
    
    # Convert any lists or tuples in params to strings for safe storage
    for k, v in params.items():
        if isinstance(v, (list, tuple)):
            result[k] = str(v)

    return result

     # Function for cross validation that accepts true_ate_value parameter
def cross_validate_hyperparams(df, method, param_grid, method_name, n_folds=n_folds, 
                             use_sgd=False, use_torch=False, 
                             n_replications=1, seed=42, verbose=True,
                             max_workers=8, true_ate_value=None):
    """
    Perform cross-validation to tune hyperparameters with parallel processing
    
    Parameters:
    -----------
    df: pd.DataFrame
        Data frame with columns: X1-X4, A, Y, Y1, Y0 (for binary) or true_dr (for continuous)
    method: str
        Classification method ('logit', 'boosting', 'neural_net', etc.)
    param_grid: dict
        Dictionary mapping parameter names to lists of values to try
    method_name: str
        Name to identify this method in results
    n_folds: int
        Number of cross-validation folds
    use_sgd: bool
        Whether to use SGD-based training
    use_torch: bool
        Whether to use PyTorch-based training
    n_replications: int
        Number of permutation replications for PW
    seed: int
        Random seed for reproducibility
    verbose: bool
        Whether to print progress information
    max_workers: int
        Maximum number of parallel workers
    true_ate_value: float
        True ATE value for evaluation (if None, will be calculated from data)
        
    Returns:
    --------
    results: pd.DataFrame
        Data frame with results of cross-validation
    """
    import concurrent.futures
    
    # Extract features and target
    feature_cols = [col for col in df.columns if col.startswith('X') and not col.endswith('_mis')]
    X = df[feature_cols].values
    A = df['A'].values
    
    # Determine if treatment is continuous
    
    continuous_treatment = 'true_dr' in df.columns
    
    # Set up cross-validation
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_splits = list(kf.split(X))
    
    # Set true target value for binary treatment
    if not continuous_treatment:
        if true_ate_value is None:
            # Calculate from potential outcomes if available
            if 'Y1' in df.columns and 'Y0' in df.columns:
                true_ate_value = np.mean(df['Y1'] - df['Y0'])
            else:
                true_ate_value = 1.0  # Default value for Kang-Schafer simulation
        
        if verbose:
            print(f"Using True ATE: {true_ate_value:.4f}")
    else:
        true_ate_value = None
    
    # Generate all parameter combinations
    param_names = list(param_grid.keys())
    param_values = list(itertools.product(*param_grid.values()))
    
    # Progress tracking
    total_combos = len(param_values)
    if verbose:
        print(f"Running {total_combos} parameter combinations for {method_name} using up to {max_workers} workers...")
    
    # Track time
    start_time = time.time()
    
    
    # Run evaluations in parallel
    results = []
    
    # Use ThreadPoolExecutor for better performance with numerical computations
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_params, (i, values)) for i, values in enumerate(param_values)]
        
        # Process results as they complete
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            if verbose and i % max(1, total_combos // 10) == 0:
                elapsed = time.time() - start_time
                print(f"Processed {i+1}/{total_combos} combinations - Elapsed: {elapsed:.1f}s")
            
            result = future.result()
            if result is not None:
                results.append(result)
    
    # Create DataFrame from results
    df_results = pd.DataFrame([r for r in results if r is not None])
    
    # Sort by target error
    if not df_results.empty:
        df_results = df_results.sort_values('target_error')
    
    # Print best parameters
    if verbose and not df_results.empty:
        best_idx = df_results['target_error'].idxmin()
        best_params = {k: df_results.loc[best_idx, k] for k in param_names}
        print(f"\nBest parameters for {method_name}:")
        for k, v in best_params.items():
            print(f"  {k}: {v}")
        print(f"Target Error: {df_results.loc[best_idx, 'target_error']:.4f}")
        print(f"Out-of-sample Log Loss: {df_results.loc[best_idx, 'out_sample_logloss']:.4f}")
    
    return df_results

In [ ]:
def cross_validate_hyperparams(df, method, param_grid, method_name, n_folds=5, 
                             use_sgd=False, use_torch=False, 
                             n_replications=1, seed=42, verbose=True,
                             max_workers=8, true_ate_value=None):
    """
    Perform cross-validation to tune hyperparameters with parallel processing
    
    Parameters:
    -----------
    df: pd.DataFrame
        Data frame with columns: X1-X4, A, Y, Y1, Y0 (for binary) or true_dr (for continuous)
    method: str
        Classification method ('logit', 'boosting', 'neural_net', etc.)
    param_grid: dict
        Dictionary mapping parameter names to lists of values to try
    method_name: str
        Name to identify this method in results
    n_folds: int
        Number of cross-validation folds
    use_sgd: bool
        Whether to use SGD-based training
    use_torch: bool
        Whether to use PyTorch-based training
    n_replications: int
        Number of permutation replications for PW
    seed: int
        Random seed for reproducibility
    verbose: bool
        Whether to print progress information
    max_workers: int
        Maximum number of parallel workers
    true_ate_value: float
        True ATE value for evaluation (if None, will be calculated from data)
        
    Returns:
    --------
    results: pd.DataFrame
        Data frame with results of cross-validation
    """
    import concurrent.futures
    
    # Extract features and target
    feature_cols = [col for col in df.columns if col.startswith('X') and not col.endswith('_mis')]
    X = df[feature_cols].values
    A = df['A'].values
    
    # Determine if treatment is continuous
    continuous_treatment = 'true_dr' in df.columns
    
    # Set up cross-validation
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_splits = list(kf.split(X))
    
    # Set true target value for binary treatment
    if not continuous_treatment:
        if true_ate_value is None:
            # Calculate from potential outcomes if available
            if 'Y1' in df.columns and 'Y0' in df.columns:
                true_ate_value = np.mean(df['Y1'] - df['Y0'])
            else:
                true_ate_value = 1.0  # Default value for Kang-Schafer simulation
        
        if verbose:
            print(f"Using True ATE: {true_ate_value:.4f}")
    else:
        true_ate_value = None
    
    # Generate all parameter combinations
    param_names = list(param_grid.keys())
    param_values = list(itertools.product(*param_grid.values()))
    
    # Progress tracking
    total_combos = len(param_values)
    if verbose:
        print(f"Running {total_combos} parameter combinations for {method_name} using up to {max_workers} workers...")
    
    # Track time
    start_time = time.time()
    

    
    # Run evaluations in parallel
    results = []
    
    # Use ProcessPoolExecutor for better performance with numerical computations
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_params, (i, values)) for i, values in enumerate(param_values)]
        
        # Process results as they complete
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            if verbose and i % max(1, total_combos // 10) == 0:
                elapsed = time.time() - start_time
                print(f"Processed {i+1}/{total_combos} combinations - Elapsed: {elapsed:.1f}s")
            
            result = future.result()
            if result is not None:
                results.append(result)
    
    # Create DataFrame from results
    df_results = pd.DataFrame([r for r in results if r is not None])
    
    # Sort by target error
    if not df_results.empty:
        df_results = df_results.sort_values('target_error')
    
    # Print best parameters
    if verbose and not df_results.empty:
        best_idx = df_results['target_error'].idxmin()
        best_params = {k: df_results.loc[best_idx, k] for k in param_names}
        print(f"\nBest parameters for {method_name}:")
        for k, v in best_params.items():
            print(f"  {k}: {v}")
        print(f"Target Error: {df_results.loc[best_idx, 'target_error']:.4f}")
        print(f"Out-of-sample Log Loss: {df_results.loc[best_idx, 'out_sample_logloss']:.4f}")
    
    return df_results

In [ ]:
# Updated cross_validate_hyperparams function 
def cross_validate_hyperparams(df, method, param_grid, method_name, n_folds=5, 
                             use_sgd=False, use_torch=False, 
                             n_replications=1, seed=42, verbose=True,
                             max_workers=8, true_ate_value=None):
    """
    Perform cross-validation to tune hyperparameters with parallel processing
    
    Parameters:
    -----------
    df: pd.DataFrame
        Data frame with columns: X1-X4, A, Y, Y1, Y0 (for binary) or true_dr (for continuous)
    method: str
        Classification method ('logit', 'boosting', 'neural_net', etc.)
    param_grid: dict
        Dictionary mapping parameter names to lists of values to try
    method_name: str
        Name to identify this method in results
    n_folds: int
        Number of cross-validation folds
    use_sgd: bool
        Whether to use SGD-based training
    use_torch: bool
        Whether to use PyTorch-based training
    n_replications: int
        Number of permutation replications for PW
    seed: int
        Random seed for reproducibility
    verbose: bool
        Whether to print progress information
    max_workers: int
        Maximum number of parallel workers
    true_ate_value: float
        True ATE value for evaluation (if None, will be calculated from data)
        
    Returns:
    --------
    results: pd.DataFrame
        Data frame with results of cross-validation
    """
    from data_utils import evaluate_ate_binary, evaluate_dose_response_continuous
    
    # Extract features and target
    feature_cols = [col for col in df.columns if col.startswith('X') and not col.endswith('_mis')]
    X = df[feature_cols].values
    A = df['A'].values
    
    # Determine if treatment is continuous
    continuous_treatment = 'true_dr' in df.columns
    
    # Set up cross-validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_splits = list(kf.split(X))
    
    # Set true target value for binary treatment
    if not continuous_treatment:
        if true_ate_value is None:
            # Calculate from potential outcomes if available
            if 'Y1' in df.columns and 'Y0' in df.columns:
                true_ate_value = np.mean(df['Y1'] - df['Y0'])
            else:
                true_ate_value = 1.0  # Default value for Kang-Schafer simulation
        
        if verbose:
            print(f"Using True ATE: {true_ate_value:.4f}")
    else:
        true_ate_value = None
    
    # Generate all parameter combinations
    param_names = list(param_grid.keys())
    param_values = list(itertools.product(*param_grid.values()))
    
    # Progress tracking
    total_combos = len(param_values)
    if verbose:
        print(f"Running {total_combos} parameter combinations for {method_name} using up to {max_workers} workers...")
    
    # Track time
    start_time = time.time()
    
    # Run evaluations in parallel
    results = []
    
    # Instead of using ProcessPoolExecutor, use ThreadPoolExecutor for Mac
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        
        # Submit tasks
        for i, values in enumerate(param_values):
            # Note we're passing all the needed parameters explicitly
            futures.append(executor.submit(
                evaluate_params,
                (i, values),
                X=X,
                A=A,
                df=df,
                cv_splits=cv_splits,
                method=method,
                n_replications=n_replications,
                use_sgd=use_sgd,
                use_torch=use_torch,
                true_ate_value=true_ate_value,
                continuous_treatment=continuous_treatment,
                method_name=method_name,
                param_names=param_names,
                evaluate_ate_binary=evaluate_ate_binary,
                evaluate_dose_response_continuous=evaluate_dose_response_continuous
            ))
        
        # Process results as they complete
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            if verbose and i % max(1, total_combos // 10) == 0:
                elapsed = time.time() - start_time
                print(f"Processed {i+1}/{total_combos} combinations - Elapsed: {elapsed:.1f}s")
            
            try:
                result = future.result()
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Error processing result: {e}")
    
    # Create DataFrame from results
    df_results = pd.DataFrame([r for r in results if r is not None])
    
    # Sort by target error
    if not df_results.empty:
        df_results = df_results.sort_values('target_error')
    
    # Print best parameters
    if verbose and not df_results.empty:
        best_idx = df_results['target_error'].idxmin()
        best_params = {k: df_results.loc[best_idx, k] for k in param_names}
        print(f"\nBest parameters for {method_name}:")
        for k, v in best_params.items():
            print(f"  {k}: {v}")
        print(f"Target Error: {df_results.loc[best_idx, 'target_error']:.4f}")
        print(f"Out-of-sample Log Loss: {df_results.loc[best_idx, 'out_sample_logloss']:.4f}")
    
    return df_results

# You can then use this in your main_hyperparameter_tuning function
# Just change concurrent.futures.ThreadPoolExecutor with ThreadPoolExecutor