In [None]:
# Install PyCaret
!pip install pycaret

In [None]:
# Import necessary libraries
import pandas as pd
from google.colab import files
from pycaret.classification import *
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Step 1: Upload and load dataset
# The return value of files.upload() is used below as a mapping keyed by the
# uploaded file names.
print("Please upload your dataset (e.g., 'ASD_Traits_Study_Data.csv').")
uploaded = files.upload()

# If nothing was uploaded (e.g. the picker was dismissed) there is no file to
# read; fail with an explicit message instead of an IndexError on an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is loaded; any additional files are ignored.
file_path = next(iter(uploaded))
df = pd.read_csv(file_path)

In [None]:
# Step 2: Define target variable and feature cases
target = 'ASD_traits'

# Column groups that the ablation cases are assembled from.
_demographic_cols = ['Gender', 'Age_Years', 'Ethnicity', 'Family_mem_with_ASD', 'Rater']
_score_cols = ['SRS', 'CARS', 'AQ10']
_binary_item_cols = [f'A{item}' for item in range(1, 11)]  # 'A1' ... 'A10'

# Each case maps a human-readable name to the feature columns used for it.
# Every value is a fresh list, so mutating one case never affects another.
# NOTE(review): "Minimal Feature Set" and "Behavioral and Diagnostic Scores"
# contain exactly the same columns — confirm whether one of them was meant to differ.
feature_cases = {
    "Minimal Feature Set": [*_score_cols],
    "Binary Diagnostic Variables": [*_binary_item_cols],
    "Behavioral and Diagnostic Scores": [*_score_cols],
    "Demographic Features Only": [*_demographic_cols],
    "Combination of Key Groups": [*_demographic_cols, *_score_cols],
    "Full Feature Set": [*_demographic_cols, 'SRS', 'CARS', *_binary_item_cols, 'AQ10'],
}

In [None]:
# Step 3: Split dataset into training and testing sets
# Separate the label column from the predictors.
y = df[target]
X = df.drop(columns=[target])

# 70/30 split, stratified on the label, with a fixed seed so the same rows
# land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Re-attach the label as the last column of each partition.
train_data = pd.concat((X_train, y_train), axis=1)
test_data = pd.concat((X_test, y_test), axis=1)

In [None]:
# Step 4: Perform PyCaret setup and model comparison for each case
def _positive_class_probability(predictions, positive_label):
    """Return the per-row probability of `positive_label`, or None if unavailable.

    `predictions` is the frame returned by predict_model. Its 'prediction_score'
    column is the confidence of the *predicted* class, so for a binary problem
    the positive-class probability is that score when the prediction is the
    positive label and its complement otherwise. Models without probability
    output produce no 'prediction_score' column; None is returned in that case.
    """
    if 'prediction_score' not in predictions.columns:
        return None
    confidence = predictions['prediction_score']
    predicted_positive = predictions['prediction_label'] == positive_label
    return confidence.where(predicted_positive, 1 - confidence)


case_results = {}

for case_name, features in feature_cases.items():
    print(f"Processing Case: {case_name}")

    # Columns used by this case: its features plus the label.
    case_columns = features + [target]

    # Filter training data for the current feature set
    case_train_data = train_data[case_columns]

    # PyCaret setup with consistent session ID
    clf_setup = setup(
        data=case_train_data,
        target=target,
        session_id=42,  # Ensure reproducibility
        fold=5,         # Fivefold cross-validation
        verbose=False
    )

    # Compare models and select the best one
    best_model = compare_models(n_select=1)  # Select only the best model
    print(f"Best Model for {case_name}: {best_model}")

    # Finalize the best model on the entire training data
    final_model = finalize_model(best_model)

    # Evaluate the finalized model on the held-out test set
    case_test_data = test_data[case_columns]
    predictions = predict_model(final_model, data=case_test_data)

    # Extract true labels and predictions
    y_true = case_test_data[target]
    y_pred = predictions['prediction_label']

    # roc_auc_score expects the score of the class with the greater label, so
    # convert the predicted-class confidence into that class's probability.
    y_prob = _positive_class_probability(predictions, y_true.max())

    # Calculate performance metrics
    metrics = {
        'Best Model': str(best_model),
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        # NaN when the selected model exposes no probability scores.
        'ROC-AUC': roc_auc_score(y_true, y_prob) if y_prob is not None else float('nan'),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Cohen Kappa': cohen_kappa_score(y_true, y_pred)
    }

    # Store results for the current case
    case_results[case_name] = metrics

In [None]:
# Step 5: Summarize Results and Save to File
# Build the table row-wise (one row per case, one column per metric). Building
# it column-wise and transposing would mix the 'Best Model' strings with the
# numeric metrics in every column and leave all of them as object dtype.
results_df = pd.DataFrame.from_dict(case_results, orient='index')
print("\nSummary of Results Across All Cases:")
print(results_df)

# The case names form the index, so keep it in the CSV.
results_df.to_csv('ablation_study_results.csv', index=True)
print("\nResults saved to 'ablation_study_results.csv'.")

In [None]:
# List the models PyCaret reports as available and print the resulting table.
# NOTE(review): models() presumably needs an experiment created by setup(), i.e.
# the Step 4 cell must have run first — confirm.
all_models = models()
print(all_models)