In [None]:
# Script to evaluate the best model using saved CSV files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Define feature names (as used in the original code)
names_TMs = ['CA125', 'CA15.3', 'CEA', 'Cyfra 21.1', 'HE4', 'NSE', 'proGRP', 'SCC']

problem = 'SCLC'  # Options: 'LC', 'NSCLC', 'SCLC'

# Per-problem settings: (class labels, target PPV).
_PROBLEM_SETTINGS = {
    'LC': (['No lung cancer', 'Primary lung carcinoma'], 0.98),
    'NSCLC': (['No lung cancer + SCLC', 'NSCLC'], 0.95),
    'SCLC': (['No lung cancer + NSCLC', 'SCLC'], 0.95),
}

# Fail fast on a mistyped problem name instead of leaving names_classes and
# ppv_aim undefined.
if problem not in _PROBLEM_SETTINGS:
    raise ValueError(
        f"Unknown problem {problem!r}; expected one of {sorted(_PROBLEM_SETTINGS)}."
    )
names_classes, ppv_aim = _PROBLEM_SETTINGS[problem]

# Lower-cased problem name, used below as the prefix of the CSV file names.
pre = problem.lower()
# Function to parse median and IQR from string format "median (IQR_low-IQR_high)"
# Each number may be an integer ("1") or a decimal ("0.95").
_METRIC_PATTERN = re.compile(
    r"\s*(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*\)"
)


def parse_metric(metric_str):
    """
    Parse a "median (IQR_low-IQR_high)" string into three floats.
    Parameters:
    - metric_str: Text such as "0.95 (0.90-0.98)"; integer-valued numbers
      such as "1 (0-1)" are accepted as well
    Returns:
    - (median, iqr_low, iqr_high) as floats, or (nan, nan, nan) when the
      input is not a string or does not match the expected format
    """
    # Empty CSV cells are read as float NaN; re.match would raise TypeError on them.
    if not isinstance(metric_str, str):
        return np.nan, np.nan, np.nan

    match = _METRIC_PATTERN.match(metric_str)
    if match is None:
        return np.nan, np.nan, np.nan

    median, iqr_low, iqr_high = (float(part) for part in match.groups())
    return median, iqr_low, iqr_high

# Function to select the best model
def select_best_model(performance_metrics_table, cv_folds_threshold=80.0, ppv_min=0.98, sens_min=0.70):
    """
    Select the best model from performance_metrics_table.csv.

    Models are kept when they meet all three thresholds; among those, the one
    with the highest median sensitivity wins, and ties are broken in favour of
    fewer features. The input DataFrame is not modified.

    Parameters:
    - performance_metrics_table: DataFrame with performance metrics; must contain
      the columns 'Number features', 'CV folds PPV criteria', 'Sens val',
      'Spec val', 'PPV val', 'NPV val' and 'AUC val'
    - cv_folds_threshold: Minimum % of CV folds meeting PPV criteria
    - ppv_min: Minimum acceptable PPV
    - sens_min: Minimum acceptable sensitivity
    Returns:
    - best_n_features: Number of features for the best model (None if no model qualifies)
    - best_metrics: Row (Series) of metrics for the best model, including the parsed
      'PPV_median' and 'Sens_median' values (None if no model qualifies)
    """
    # Parse median values for PPV and Sensitivity on a copy, so the caller's
    # DataFrame does not silently gain helper columns.
    table = performance_metrics_table.assign(
        PPV_median=performance_metrics_table['PPV val'].apply(lambda x: parse_metric(x)[0]),
        Sens_median=performance_metrics_table['Sens val'].apply(lambda x: parse_metric(x)[0]),
    )

    # Filter models meeting CV folds, PPV, and sensitivity criteria.
    # Unparseable metrics are NaN and therefore fail the comparisons.
    valid_models = table[
        (table['CV folds PPV criteria'] >= cv_folds_threshold) &
        (table['PPV_median'] >= ppv_min) &
        (table['Sens_median'] >= sens_min)
    ]

    if valid_models.empty:
        print("No models meet the criteria: CV folds >= {:.1f}%, PPV >= {:.2f}, Sensitivity >= {:.2f}.".format(
            cv_folds_threshold, ppv_min, sens_min))
        return None, None

    # Sort by Sensitivity (descending) and Number of features (ascending)
    valid_models = valid_models.sort_values(
        by=['Sens_median', 'Number features'], ascending=[False, True]
    )

    # Select the top model
    best_model = valid_models.iloc[0]
    best_n_features = int(best_model['Number features'])
    print("Best model: {} features".format(best_n_features))
    print("Performance metrics:\n", best_model[['Number features', 'CV folds PPV criteria', 
                                              'Sens val', 'Spec val', 'PPV val', 'NPV val', 'AUC val']])

    return best_n_features, best_model

# Load CSV files
try:
    performance_metrics_table = pd.read_csv(f"{pre}performance_metrics_table.csv")
    selected_features_matrix = pd.read_csv(f"{pre}selected_features_matrix.csv", header=None)
except FileNotFoundError as e:
    print(f"Error: {e}. Ensure CSV files are in the current directory.")
    # Raise instead of calling exit(): inside a notebook exit() does not stop
    # the cell, so the code below would run with undefined variables.
    raise SystemExit(1) from e

# Convert selected_features_matrix to a numpy array.
# Rows are labelled with names_TMs and columns with the feature counts below.
selected_features_matrix = selected_features_matrix.to_numpy()
n_features_to_select = performance_metrics_table['Number features'].values

# Evaluate the best model.
# ppv_aim comes from the `problem` configuration above (0.98 for LC, 0.95 for
# NSCLC/SCLC); it must not be overridden here or the wrong threshold is used.
best_n_features, best_metrics = select_best_model(
    performance_metrics_table,
    cv_folds_threshold=80.0,
    ppv_min=ppv_aim,
    sens_min=0.70
)

# If a best model is found, inspect selected features
if best_n_features is not None:
    # Column of the matrix that corresponds to the winning feature count.
    idx = list(n_features_to_select).index(best_n_features)
    selected_features = selected_features_matrix[:, idx]
    print("\nSelected features (percentage of folds selected):")
    for feature, percentage in zip(names_TMs, selected_features):
        if percentage > 0:
            print(f"{feature}: {percentage:.1f}%")

    # Plot feature selection heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(selected_features_matrix, annot=True, fmt=".0f", cmap="Blues",
                xticklabels=n_features_to_select, yticklabels=names_TMs)
    plt.xlabel('Number of Selected Features')
    plt.ylabel('Input Variables')
    plt.title('Percentage of Folds Where Features Were Selected')
    plt.savefig('feature_selection_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("Heatmap saved to feature_selection_heatmap.png")

    # Plot performance metrics vs. number of features.
    # Medians are parsed locally so the plot does not depend on helper columns
    # being present in performance_metrics_table.
    sens_median = performance_metrics_table['Sens val'].apply(lambda x: parse_metric(x)[0])
    ppv_median = performance_metrics_table['PPV val'].apply(lambda x: parse_metric(x)[0])
    plt.figure(figsize=(8, 6))
    sns.lineplot(x='Number features', y=sens_median,
                 data=performance_metrics_table, label='Sensitivity')
    sns.lineplot(x='Number features', y=ppv_median,
                 data=performance_metrics_table, label='PPV')
    plt.axvline(x=best_n_features, color='red', linestyle='--', label='Best Model')
    plt.xlabel('Number of Features')
    plt.ylabel('Metric Value')
    plt.title('Model Performance vs. Number of Features')
    plt.legend()
    plt.savefig('performance_plot.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("Performance plot saved to performance_plot.png")

    # Optionally, load predictions for the best model.
    # NOTE(review): these file names carry no `pre` prefix, unlike the CSVs
    # loaded above - confirm this matches how the files were written.
    try:
        predicted_prob = pd.read_csv(f"predicted_prob_{best_n_features}_features.csv", header=None).to_numpy()
        predicted_class = pd.read_csv(f"predicted_class_{best_n_features}_features.csv", header=None).to_numpy()
        print(f"Loaded predictions for {best_n_features} features.")
        # Example: Compute percentage of samples predicted as class 1.
        # `predicted_class == 1` turns NaN into False, so missing entries must be
        # masked first for nanmean to ignore them.
        # NOTE(review): assumes NaN marks a sample without a prediction - confirm.
        is_class_one = np.where(pd.isna(predicted_class), np.nan, predicted_class == 1)
        percentage_class_one = np.nanmean(is_class_one, axis=1) * 100
        print(f"Average percentage of samples predicted as class 1: {np.nanmean(percentage_class_one):.1f}%")
    except FileNotFoundError:
        print(f"Prediction files for {best_n_features} features not found.")

Error: [Errno 2] No such file or directory: 'performance_metrics_table.csv'. Ensure CSV files are in the current directory.


NameError: name 'selected_features_matrix' is not defined

: 