In [None]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, cohen_kappa_score

In [None]:
# Step 2: Load the dataset and print a quick overview
# Point data_file at the real CSV; it must be reachable from the working directory
data_file = 'ASD_Traits_Study_Data.csv'
df = pd.read_csv(data_file)

# First rows, as a visual sanity check that the file parsed as expected
print("Dataset Preview:", df.head(), sep="\n")

# Column names, dtypes and non-null counts (info() writes directly to stdout)
print("\nDataset Information:")
df.info()

# Null count per column, reporting only the columns that actually have gaps
missing_values = df.isna().sum()
print("\nMissing Values:", missing_values.loc[missing_values > 0], sep="\n")

# Number of rows and columns
print(f"\nDataset Shape: {df.shape}")

In [None]:
# Step 3: Name the target column and the feature subsets (cases) to compare
target = 'ASD_traits'

# Column groups that several cases share
_demographic = ['Gender', 'Age_Years', 'Ethnicity', 'Family_mem_with_ASD', 'Rater']
_scores = ['SRS', 'CARS', 'AQ10']
_binary_items = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']

# Feature subsets for the ablation study, keyed by the label used in the results.
# Each case gets its own list object so editing one never changes another.
# NOTE(review): "Minimal Feature Set" and "Behavioral and Diagnostic Scores" contain
# the same three columns, so those two cases train on identical inputs.
feature_cases = {
    "Minimal Feature Set": list(_scores),
    "Binary Diagnostic Variables": list(_binary_items),
    "Behavioral and Diagnostic Scores": list(_scores),
    "Demographic Features Only": list(_demographic),
    "Combination of Key Groups": _demographic + _scores,
    # Column order: demographics, SRS, CARS, the ten binary items, then AQ10
    "Full Feature Set": _demographic + ['SRS', 'CARS'] + _binary_items + ['AQ10'],
}

In [None]:
# Step 4: Hold out 30% of the rows for testing (70:30), keeping the class balance in both parts
y = df[target]
X = df.drop(columns=[target])

# stratify=y keeps the target's class proportions in both partitions;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Step 5: Define the function for model training and evaluation
def evaluate_model(X_train, X_test, y_train, y_test, cat_features=None):
    """
    Tune a CatBoost classifier with a 5-fold grid search and score it on the test set.

    A single stratified, shuffled 5-fold grid search (scored by F1) runs over the
    whole training set. GridSearchCV then refits the winning configuration on all
    training rows (its default ``refit=True``), and that refit model produces the
    test-set metrics.

    Parameters
    ----------
    X_train, X_test
        Feature tables used for tuning/fitting and for the final evaluation.
    y_train, y_test
        Target values matching ``X_train`` and ``X_test``.
    cat_features : list, optional
        Categorical feature identifiers, forwarded unchanged to CatBoostClassifier.

    Returns
    -------
    dict
        'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'MCC' and
        'Cohen Kappa' computed on the test set, plus 'Best Params' holding the
        hyperparameters selected by the grid search.
    """
    # Hyperparameter grid for CatBoost (2 x 3 x 5 = 30 combinations)
    grid = {
        'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]
    }

    # Stratified, shuffled folds with a fixed seed so the tuning is repeatable
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    model = CatBoostClassifier(cat_features=cat_features, random_state=42, verbose=0)

    # One grid search over the full training set. Wrapping this in a second
    # fold loop and keeping the luckiest fold's model would select on noise
    # and leave part of the training data unused by the final model.
    # NOTE(review): scoring='f1' and the precision/recall/F1 calls below use the
    # default positive label 1 - assumes the target is encoded as 0/1; confirm.
    grid_search = GridSearchCV(model, param_grid=grid, scoring='f1', cv=kf, n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the best configuration refit on all of X_train
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate the tuned model on the held-out test set
    y_test_pred = best_model.predict(X_test)
    y_test_prob = best_model.predict_proba(X_test)[:, 1]  # second probability column feeds ROC-AUC
    metrics = {
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred),
        'Recall': recall_score(y_test, y_test_pred),
        'F1-Score': f1_score(y_test, y_test_pred),
        'ROC-AUC': roc_auc_score(y_test, y_test_prob),
        'MCC': matthews_corrcoef(y_test, y_test_pred),
        'Cohen Kappa': cohen_kappa_score(y_test, y_test_pred),
        'Best Params': best_params
    }
    return metrics

In [None]:
# Step 6: Perform the ablation study
results = {}  # Maps each case name to the metrics dictionary returned for it

# Column dtypes do not change between cases, so detect the categorical
# (object / category) columns once instead of re-running select_dtypes for
# every feature of every case.
categorical_columns = set(X.select_dtypes(include=[object, 'category']).columns)

for case_name, features in feature_cases.items():
    print(f"Processing: {case_name}")
    # Restrict the train and test tables to the current feature set
    X_train_case = X_train[features]
    X_test_case = X_test[features]
    # Categorical features for CatBoost, in the same order as `features`
    cat_features = [col for col in features if col in categorical_columns]
    # Tune, fit and score a model for the current case
    metrics = evaluate_model(X_train_case, X_test_case, y_train, y_test, cat_features=cat_features)
    results[case_name] = metrics
    print(f"Results for {case_name}: {metrics}")

In [None]:
# Step 7: Collect the per-case metrics into one table
# The dict of dicts yields one column per case; transposing gives one row per case
results_df = pd.DataFrame(results).transpose()
print(results_df)

In [None]:
# Step 8: Write the summary table to disk for later analysis
# index=True keeps the case names as the first CSV column
results_df.to_csv(path_or_buf='ablation_study_results.csv', index=True)