In [None]:
# Import required libraries
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, cohen_kappa_score
import numpy as np
import pandas as pd

In [None]:
# Read the study data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='ASD_Traits_Study_Data.csv')

# Preview the top five rows as the cell's output
df.head(n=5)

In [None]:
# Target is the 'ASD_traits' column; every other column is a predictor
y = df['ASD_traits']
X = df.drop(columns=['ASD_traits'])

# Names of the predictor columns with object or categorical dtype
category_cols = list(X.select_dtypes(include=[object, 'category']).columns)

# Stratified 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the shapes of both partitions as a sanity check
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

In [None]:
import matplotlib.pyplot as plt
import shap
import numpy as np

In [None]:
# Settings for the final CatBoost model. The values are hard-coded here; the
# original notes mark learning_rate, depth and l2_leaf_reg as placeholders to
# be replaced with the winners of a grid search (not shown in this part of the
# notebook -- TODO confirm they match the tuned values).
catboost_params = {
    "cat_features": category_cols,
    "learning_rate": 0.03,
    "depth": 4,
    "l2_leaf_reg": 5,
    "random_state": 42,
    "verbose": 0,
}

# Build the classifier from those settings and fit it on the training split
best_model = CatBoostClassifier(**catboost_params)
best_model.fit(X_train, y_train)

# Feature importances as reported by CatBoost itself (prettified form)
catboost_importances = best_model.get_feature_importance(prettified=True)
print(catboost_importances)

In [None]:
# Wrap the fitted model in a SHAP TreeExplainer
explainer = shap.TreeExplainer(best_model)

# SHAP values for the held-out test rows
shap_values = explainer.shap_values(X_test)

# Sanity check: report the dimensions of the result
shap_dims = np.array(shap_values).shape
print(f"SHAP values shape: {shap_dims}")

In [None]:
# SHAP summary bar plot with every bar annotated with its value, saved as a
# high-resolution PNG.
plt.figure(figsize=(10, 8))

# Draw the bar summary without showing it so it can be annotated first.
# plot_size is passed explicitly (as the detailed "dot" summary cell does):
# with SHAP's default automatic sizing the current figure is resized and the
# 10x8 size requested above is lost.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, plot_size=(10, 8))

# Mean absolute SHAP value per feature and the descending ranking of features.
# The annotation loop below does not need them (each label is read from the
# bar itself); they stay defined as notebook-level names in case other cells
# rely on them.
feature_importances = np.abs(shap_values).mean(0)
sorted_indices = np.argsort(feature_importances)[::-1]  # Sorting by importance

# Retrieve the current plot's axis for annotation
ax = plt.gca()

# Label each bar with its own width. The gap between bar end and text is given
# in points rather than data units, so it looks the same whatever the scale of
# the SHAP values is.
for bar in ax.patches:
    width = bar.get_width()
    ax.annotate(
        f'{width:.4f}',
        xy=(width, bar.get_y() + bar.get_height() / 2),
        xytext=(4, 0),
        textcoords='offset points',
        ha='left', va='center', fontsize=12, color='black',
    )

# Save the SHAP summary plot as a high-resolution image. bbox_inches="tight"
# keeps labels that extend past the axes inside the saved image.
plt.tight_layout()
plt.savefig("[ASD]_shap_summary_plot_bar_custom_fixed_auto.png", dpi=1000, bbox_inches="tight")
plt.close()

In [None]:
# Detailed ("dot") SHAP summary plot, written to disk as a PNG
detail_size = (10, 8)
detail_path = "[ASD]_shap_detailed_summary_plot_fixed.png"

plt.figure(figsize=detail_size)

# Draw without showing so the figure can be saved afterwards
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="dot",
    plot_size=detail_size,
    show=False,
)

# Tighten the layout, export at 1000 dpi, then close the figure
plt.tight_layout()
plt.savefig(detail_path, dpi=1000, bbox_inches="tight")
plt.close()

print("Detailed SHAP summary plot saved successfully!")

In [None]:
# Generate and save SHAP dependence plots for selected
# (feature, interaction feature) pairs.
#
# Each plot is drawn on an explicitly created 8x6 axes that is handed to SHAP
# through `ax`. Without `ax`, shap.dependence_plot opens a figure of its own,
# so a figure created beforehand with plt.figure() stays empty, is never
# closed, and its requested size is not applied to the saved image.
dependence_pairs = [
    ('SRS', 'CARS'),
    ('CARS', 'AQ10'),
    ('AQ10', 'SRS'),
]

for dep_feature, dep_interaction in dependence_pairs:
    dep_fig, dep_ax = plt.subplots(figsize=(8, 6))
    shap.dependence_plot(
        dep_feature,
        shap_values,
        X_test,
        interaction_index=dep_interaction,
        ax=dep_ax,
        show=False,
    )
    dep_fig.tight_layout()  # Adjust layout to avoid overlap
    dep_fig.savefig(
        f"[ASD]_shap_dependence_plot_{dep_feature}_{dep_interaction}.png",
        dpi=1000,
        bbox_inches='tight',
    )
    # Close this specific figure so nothing is left open between iterations
    plt.close(dep_fig)

print("All SHAP dependence plots saved successfully!")

In [None]:
# Create a SHAP explainer object for the trained model
# (same construction as in the earlier explainer cell, repeated here)
explainer = shap.TreeExplainer(best_model)

# Calculate SHAP values for the test set by calling the explainer directly.
# NOTE: this rebinds `shap_values`. An earlier cell set it from
# explainer.shap_values(X_test); from here on the cells below index it
# (shap_values[0]) and read its `.values` attribute, so they must run after
# this cell, and the earlier summary/dependence cells were written against the
# previous binding.
shap_values = explainer(X_test)

# SHAP values are ready to use for visualizations
print("SHAP values calculated successfully!")

In [None]:
# Waterfall plot for the first entry (index 0) of the SHAP values, saved as a PNG
first_obs_path = "[ASD] shap_waterfall_plot_obs_1.png"

plt.figure(figsize=(10, 8))

# Draw without showing so the figure can be written to disk
shap.waterfall_plot(shap_values[0], show=False)

plt.tight_layout()
plt.savefig(first_obs_path, dpi=1000)
plt.close()

print("Waterfall plot for the first observation saved as '[ASD] shap_waterfall_plot_obs_1.png'.")

In [None]:
# Generate SHAP Waterfall Plot for a specific observation: index 10 of the
# SHAP values, i.e. the 11th instance (matching the "obs_11" file name)
plt.figure(figsize=(10, 8))
shap.waterfall_plot(shap_values[10], show=False)  # 11th observation (0-based index 10)
plt.tight_layout()
plt.savefig("[ASD] shap_waterfall_plot_obs_11.png", dpi=1000)
plt.close()

# The message names the 11th observation so it agrees with the plotted index
# and the saved file name.
print("Waterfall plot for the 11th observation saved as '[ASD] shap_waterfall_plot_obs_11.png'.")

In [None]:
# SHAP decision plot for the test set, saved as a PNG.
# The plot is fed the `.values` attribute of `shap_values` together with the
# explainer's expected value and the test features.
shap_values_array = shap_values.values
decision_path = "[ASD] shap_decision_plot.png"

plt.figure(figsize=(10, 8))

# Draw without showing so the figure can be written to disk
shap.decision_plot(
    explainer.expected_value,
    shap_values_array,
    X_test,
    show=False,
)

plt.tight_layout()
plt.savefig(decision_path, dpi=1000)
plt.close()

print("Decision plot saved as '[ASD] shap_decision_plot.png'.")

In [None]:
# SHAP heatmap over the computed SHAP values, saved as a PNG
heatmap_path = "[ASD] shap_heatmap_plot.png"

plt.figure(figsize=(10, 8))

# Draw without showing so the figure can be written to disk
shap.plots.heatmap(shap_values, show=False)

plt.tight_layout()
plt.savefig(heatmap_path, dpi=1000)
plt.close()

print("Heatmap plot saved as '[ASD] shap_heatmap_plot.png'.")